From bcbb85af7668a17c6498200f4675a6ac41d868f6 Mon Sep 17 00:00:00 2001
From: zxcd <228587199@qq.com>
Date: Thu, 23 May 2024 19:34:04 +0800
Subject: [PATCH 01/18] fixed version for paddlepaddle. (#3701)

* fixed version for paddlepaddle.

* fix code style
---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index af7c4dc3..8e81da6d 100644
--- a/setup.py
+++ b/setup.py
@@ -53,6 +53,7 @@ base = [
     "pandas",
     "paddleaudio>=1.1.0",
     "paddlenlp>=2.4.8",
+    "paddlepaddle==2.5.1",
     "paddleslim>=2.3.4",
     "ppdiffusers>=0.9.0",
     "paddlespeech_feat",

From 03022f2170ce76d2ca8385a92aa8df3519e2366b Mon Sep 17 00:00:00 2001
From: mjxs <52824616+kk-2000@users.noreply.github.com>
Date: Tue, 4 Jun 2024 10:34:39 +0800
Subject: [PATCH 02/18] =?UTF-8?q?=E3=80=90Fix=20Speech=20Issue=20No.5?=
 =?UTF-8?q?=E3=80=91issue=203444=20transformation=20import=20error=20(#377?=
 =?UTF-8?q?9)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix paddlespeech.s2t.transform.transformation import error

* fix paddlespeech.s2t.transform import error
---
 audio/tests/features/base.py                    | 2 +-
 audio/tests/features/test_istft.py              | 4 ++--
 audio/tests/features/test_log_melspectrogram.py | 2 +-
 audio/tests/features/test_spectrogram.py        | 2 +-
 audio/tests/features/test_stft.py               | 2 +-
 docs/tutorial/asr/tutorial_transformer.ipynb    | 4 ++--
 utils/apply-cmvn.py                             | 2 +-
 utils/compute-cmvn-stats.py                     | 2 +-
 utils/copy-feats.py                             | 2 +-
 utils/feat-to-shape.py                          | 2 +-
 10 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/audio/tests/features/base.py b/audio/tests/features/base.py
index d183b72a..3bb1d1dd 100644
--- a/audio/tests/features/base.py
+++ b/audio/tests/features/base.py
@@ -37,7 +37,7 @@ class FeatTest(unittest.TestCase):
         self.waveform, self.sr = load(os.path.abspath(os.path.basename(url)))
         self.waveform = self.waveform.astype(
             np.float32
-        )  # paddlespeech.s2t.transform.spectrogram only supports float32
+        )  # paddlespeech.audio.transform.spectrogram only supports float32
         dim = len(self.waveform.shape)
         assert dim in [1, 2]

diff --git a/audio/tests/features/test_istft.py b/audio/tests/features/test_istft.py
index 9cf8cdd6..ea1ee5cb 100644
--- a/audio/tests/features/test_istft.py
+++ b/audio/tests/features/test_istft.py
@@ -18,8 +18,8 @@ import paddle
 from paddleaudio.functional.window import get_window

 from .base import FeatTest
-from paddlespeech.s2t.transform.spectrogram import IStft
-from paddlespeech.s2t.transform.spectrogram import Stft
+from paddlespeech.audio.transform.spectrogram import IStft
+from paddlespeech.audio.transform.spectrogram import Stft


 class TestIstft(FeatTest):

diff --git a/audio/tests/features/test_log_melspectrogram.py b/audio/tests/features/test_log_melspectrogram.py
index 7d568038..b2765d3b 100644
--- a/audio/tests/features/test_log_melspectrogram.py
+++ b/audio/tests/features/test_log_melspectrogram.py
@@ -18,7 +18,7 @@ import paddle
 import paddleaudio

 from .base import FeatTest
-from paddlespeech.s2t.transform.spectrogram import LogMelSpectrogram
+from paddlespeech.audio.transform.spectrogram import LogMelSpectrogram


 class TestLogMelSpectrogram(FeatTest):

diff --git a/audio/tests/features/test_spectrogram.py b/audio/tests/features/test_spectrogram.py
index 5fe5afee..6f460963 100644
--- a/audio/tests/features/test_spectrogram.py
+++ b/audio/tests/features/test_spectrogram.py
@@ -18,7 +18,7 @@ import paddle
 import paddleaudio

 from .base import FeatTest
-from paddlespeech.s2t.transform.spectrogram import Spectrogram
+from paddlespeech.audio.transform.spectrogram import Spectrogram


 class TestSpectrogram(FeatTest):

diff --git a/audio/tests/features/test_stft.py b/audio/tests/features/test_stft.py
index 58792ffe..9511a292 100644
--- a/audio/tests/features/test_stft.py
+++ b/audio/tests/features/test_stft.py
@@ -18,7 +18,7 @@ import paddle
 from paddleaudio.functional.window import get_window

 from .base import FeatTest
-from paddlespeech.s2t.transform.spectrogram import Stft
+from paddlespeech.audio.transform.spectrogram import Stft


 class TestStft(FeatTest):

diff --git a/docs/tutorial/asr/tutorial_transformer.ipynb b/docs/tutorial/asr/tutorial_transformer.ipynb
index dc303006..77aed4bf 100644
--- a/docs/tutorial/asr/tutorial_transformer.ipynb
+++ b/docs/tutorial/asr/tutorial_transformer.ipynb
@@ -236,8 +236,8 @@
     "warnings.filterwarnings('ignore')\n",
     "\n",
     "from yacs.config import CfgNode\n",
-    "from paddlespeech.s2t.transform.spectrogram import LogMelSpectrogramKaldi\n",
-    "from paddlespeech.s2t.transform.cmvn import GlobalCMVN\n",
+    "from paddlespeech.audio.transform.spectrogram import LogMelSpectrogramKaldi\n",
+    "from paddlespeech.audio.transform.cmvn import GlobalCMVN\n",
     "from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer\n",
     "from paddlespeech.s2t.models.u2 import U2Model\n",
     "\n",

diff --git a/utils/apply-cmvn.py b/utils/apply-cmvn.py
index cf91bdfc..fa69ff8e 100755
--- a/utils/apply-cmvn.py
+++ b/utils/apply-cmvn.py
@@ -6,7 +6,7 @@ import kaldiio
 import numpy
 from distutils.util import strtobool

-from paddlespeech.s2t.transform.cmvn import CMVN
+from paddlespeech.audio.transform.cmvn import CMVN
 from paddlespeech.s2t.utils.cli_readers import file_reader_helper
 from paddlespeech.s2t.utils.cli_utils import get_commandline_args
 from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style

diff --git a/utils/compute-cmvn-stats.py b/utils/compute-cmvn-stats.py
index 276bcd36..763347ce 100755
--- a/utils/compute-cmvn-stats.py
+++ b/utils/compute-cmvn-stats.py
@@ -5,7 +5,7 @@ import logging
 import kaldiio
 import numpy as np

-from paddlespeech.s2t.transform.transformation import Transformation
+from paddlespeech.audio.transform.transformation import Transformation
 from paddlespeech.s2t.utils.cli_readers import file_reader_helper
 from paddlespeech.s2t.utils.cli_utils import get_commandline_args
 from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style

diff --git a/utils/copy-feats.py b/utils/copy-feats.py
index dc7a70b4..89ea30f9 100755
--- a/utils/copy-feats.py
+++ b/utils/copy-feats.py
@@ -4,7 +4,7 @@ import logging

 from distutils.util import strtobool

-from paddlespeech.s2t.transform.transformation import Transformation
+from paddlespeech.audio.transform.transformation import Transformation
 from paddlespeech.s2t.utils.cli_readers import file_reader_helper
 from paddlespeech.s2t.utils.cli_utils import get_commandline_args
 from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style

diff --git a/utils/feat-to-shape.py b/utils/feat-to-shape.py
index bbc9242f..e5e014de 100755
--- a/utils/feat-to-shape.py
+++ b/utils/feat-to-shape.py
@@ -3,7 +3,7 @@ import argparse
 import logging
 import sys

-from paddlespeech.s2t.transform.transformation import Transformation
+from paddlespeech.audio.transform.transformation import Transformation
 from paddlespeech.s2t.utils.cli_readers import file_reader_helper
 from paddlespeech.s2t.utils.cli_utils import get_commandline_args
 from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style

From 09e5d8a4ac03f29c2ce6511e1a3c39136cd3e29b Mon Sep 17 00:00:00 2001
From: Mattheliu <leonliuzx@outlook.com>
Date: Wed, 5 Jun 2024 10:41:32 +0800
Subject: [PATCH 03/18] =?UTF-8?q?=E3=80=90Fix=20Speech=20Issue=20No.8?=
 =?UTF-8?q?=E3=80=91issue=203652=20merge=5Fyi=20function=20has=20a=20bug?=
 =?UTF-8?q?=20(#3786)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* 【Fix Speech Issue No.8】issue 3652 merge_yi function has a bug

* 【Fix Speech Issue No.8】issue 3652 merge_yi function has a bug
---
 paddlespeech/t2s/frontend/tone_sandhi.py | 25 ++++++++++--------------
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/paddlespeech/t2s/frontend/tone_sandhi.py b/paddlespeech/t2s/frontend/tone_sandhi.py
index 690f69aa..3558064c 100644
--- a/paddlespeech/t2s/frontend/tone_sandhi.py
+++ b/paddlespeech/t2s/frontend/tone_sandhi.py
@@ -237,30 +237,25 @@ class ToneSandhi():
     # output seg: [['听一听', 'v']]
     def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
         new_seg = []
+        skip_next = False
         # function 1
         for i, (word, pos) in enumerate(seg):
-            if i - 1 >= 0 and word == "一" and i + 1 < len(seg) and seg[i - 1][
-                    0] == seg[i + 1][0] and seg[i - 1][1] == "v":
-                if i - 1 < len(new_seg):
-                    new_seg[i -
-                            1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0]
-                else:
-                    new_seg.append([word, pos])
-                    new_seg.append([seg[i + 1][0], pos])
+            if skip_next:
+                skip_next = False
+                continue
+            if i - 1 >= 0 and word == "一" and i + 1 < len(seg) and seg[i - 1][0] == seg[i + 1][0] and seg[i - 1][1] == "v":
+                new_seg[-1] = (new_seg[-1][0] + "一" + seg[i + 1][0], new_seg[-1][1])
+                skip_next = True
             else:
-                if i - 2 >= 0 and seg[i - 1][0] == "一" and seg[i - 2][
-                        0] == word and pos == "v":
-                    continue
-                else:
-                    new_seg.append([word, pos])
+                new_seg.append((word, pos))
         seg = new_seg
         new_seg = []
         # function 2
         for i, (word, pos) in enumerate(seg):
             if new_seg and new_seg[-1][0] == "一":
-                new_seg[-1][0] = new_seg[-1][0] + word
+                new_seg[-1] = (new_seg[-1][0] + word, new_seg[-1][1])
             else:
-                new_seg.append([word, pos])
+                new_seg.append((word, pos))
         return new_seg

     # the first and the second words are all_tone_three
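Patch 03 replaces the old index arithmetic in `_merge_yi` (which could write to `new_seg[i - 1]` with a stale index, or append the repeated verb twice) with a fold into `new_seg[-1]` plus a `skip_next` flag. A minimal standalone sketch of the fixed "function 1" logic, with a hypothetical input, behaves like this (segments are `(word, pos)` tuples as in the patch):

```python
from typing import List, Tuple


def merge_yi(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """Merge "verb + 一 + same verb" (e.g. 听一听) into one segment."""
    new_seg = []
    skip_next = False
    for i, (word, pos) in enumerate(seg):
        if skip_next:
            skip_next = False
            continue
        if (i - 1 >= 0 and word == "一" and i + 1 < len(seg) and
                seg[i - 1][0] == seg[i + 1][0] and seg[i - 1][1] == "v"):
            # fold "一" and the repeated verb into the previous segment
            new_seg[-1] = (new_seg[-1][0] + "一" + seg[i + 1][0], new_seg[-1][1])
            skip_next = True
        else:
            new_seg.append((word, pos))
    return new_seg


print(merge_yi([("听", "v"), ("一", "m"), ("听", "v")]))  # [('听一听', 'v')]
```

The `skip_next` flag is what the original loop lacked: after folding `seg[i + 1]` into `new_seg[-1]`, the repeated verb must not be appended again on the next iteration.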
From 05660a62cb2f56c1af0773be06a75d8dbc18df20 Mon Sep 17 00:00:00 2001
From: zxcd <228587199@qq.com>
Date: Wed, 5 Jun 2024 14:28:14 +0800
Subject: [PATCH 04/18] =?UTF-8?q?=E3=80=90test=E3=80=91add=20cli=20test=20?=
 =?UTF-8?q?readme=20(#3784)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* add cli test readme

* fix code style
---
 tests/unit/cli/test_cli.sh |  2 ++
 tests/unit/doc/test_cli.md | 29 +++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)
 create mode 100644 tests/unit/doc/test_cli.md

diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh
index a7f7d11e..3bc2eae2 100755
--- a/tests/unit/cli/test_cli.sh
+++ b/tests/unit/cli/test_cli.sh
@@ -110,5 +110,7 @@ paddlespeech whisper --task transcribe --input ./zh.wav

 # whisper recognize text and translate to English
 paddlespeech whisper --task translate --input ./zh.wav
+# to change model English-Only model
+paddlespeech whisper --lang en --size base --task transcribe --input ./en.wav

 echo -e "\033[32mTest success !!!\033[0m"

diff --git a/tests/unit/doc/test_cli.md b/tests/unit/doc/test_cli.md
new file mode 100644
index 00000000..34a0c016
--- /dev/null
+++ b/tests/unit/doc/test_cli.md
@@ -0,0 +1,29 @@
+# test CLI 测试文档
+
+ 该文档为 CLI 测试说明,该测试目前覆盖大部分 paddlespeech 中的 CLI 推理。该 CI 建立后用于快速验证修复是否正确。
+
+ # 测试流程
+ ## 1. 环境安装
+
+ CI 重建时在已有通过版本 paddlepaddle-gpu==2.5.1, paddlepseech==develop 下运行。
+
+ CI 重建后在 paddlepaddle-gpu==develop, paddlepseech==develop 下运行。
+
+ ### 其他相关依赖
+
+ gcc >= 4.8.5,
+ python >= 3.8
+
+ ## 2. 功能测试
+
+ 在 repo 的 tests/unit/cli 中运行:
+
+ ```shell
+
+ source path.sh
+ bash test_cli.sh
+
+ ```
+## 3. 预期结果
+
+ 输出 "Test success",且运行过程中无报错或 Error 即为成功。

From 72ce8861779cc7fef9eb3277217878fd65375c58 Mon Sep 17 00:00:00 2001
From: zxcd <228587199@qq.com>
Date: Thu, 6 Jun 2024 15:26:16 +0800
Subject: [PATCH 05/18] =?UTF-8?q?=E3=80=90test=E3=80=91fix=20test=20cli=20?=
 =?UTF-8?q?bug=20(#3793)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* add cli test readme

* fix code style

* fix bug
---
 tests/unit/cli/test_cli.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh
index 3bc2eae2..3903e659 100755
--- a/tests/unit/cli/test_cli.sh
+++ b/tests/unit/cli/test_cli.sh
@@ -10,11 +10,12 @@ paddlespeech cls --input ./cat.wav --topk 10
 paddlespeech text --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 --model ernie_linear_p3_wudao_fast

 # Speech SSL
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
 paddlespeech ssl --task asr --lang en --input ./en.wav
 paddlespeech ssl --task vector --lang en --input ./en.wav

 # Speech_recognition
-wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav
 paddlespeech asr --input ./zh.wav
 paddlespeech asr --model conformer_aishell --input ./zh.wav
 paddlespeech asr --model conformer_online_aishell --input ./zh.wav

From e8018a11ce73176549d92ddbac9bc4b0bbdd2157 Mon Sep 17 00:00:00 2001
From: zxcd <228587199@qq.com>
Date: Fri, 7 Jun 2024 14:11:36 +0800
Subject: [PATCH 06/18] Update setup.py (#3795)

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 8e81da6d..10a6502c 100644
--- a/setup.py
+++ b/setup.py
@@ -53,7 +53,7 @@ base = [
     "pandas",
     "paddleaudio>=1.1.0",
     "paddlenlp>=2.4.8",
-    "paddlepaddle==2.5.1",
+    "paddlepaddle-gpu==2.5.1",
     "paddleslim>=2.3.4",
     "ppdiffusers>=0.9.0",
     "paddlespeech_feat",

From 91170bd2604e5a22237fcb46ebcf44f4d86914b5 Mon Sep 17 00:00:00 2001
From: zxcd <228587199@qq.com>
Date: Tue, 11 Jun 2024 11:12:58 +0800
Subject: [PATCH 07/18] adapt view behavior change, fix KeyError. (#3794)

* adapt view behavior change, fix KeyError.

* fix readme demo run error.

* fixed opencc version
---
 paddlespeech/cli/asr/infer.py                    | 2 +-
 paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py | 2 +-
 setup.py                                         | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py
index 4001f957..231a00f4 100644
--- a/paddlespeech/cli/asr/infer.py
+++ b/paddlespeech/cli/asr/infer.py
@@ -274,7 +274,7 @@ class ASRExecutor(BaseExecutor):

             # fbank
             audio = preprocessing(audio, **preprocess_args)
-            audio_len = paddle.to_tensor([audio.shape[0]]).unsqueeze(axis=0)
+            audio_len = paddle.to_tensor(audio.shape[0]).unsqueeze(axis=0)
             audio = paddle.to_tensor(audio, dtype='float32').unsqueeze(axis=0)

             self._inputs["audio"] = audio

diff --git a/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py b/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py
index a3744d34..64195def 100755
--- a/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py
+++ b/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py
@@ -188,7 +188,7 @@ class Wav2vec2ASR(nn.Layer):
         x_lens = x.shape[1]
         ctc_probs = self.ctc.log_softmax(x)  # (B, maxlen, vocab_size)
         topk_prob, topk_index = ctc_probs.topk(1, axis=2)  # (B, maxlen, 1)
-        topk_index = topk_index.view([batch_size, x_lens])  # (B, maxlen)
+        topk_index = topk_index.reshape([batch_size, x_lens])  # (B, maxlen)

         hyps = [hyp.tolist() for hyp in topk_index]
         hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps]

diff --git a/setup.py b/setup.py
index 10a6502c..030f7f88 100644
--- a/setup.py
+++ b/setup.py
@@ -48,7 +48,7 @@ base = [
     "matplotlib",
     "nara_wpe",
     "onnxruntime>=1.11.0",
-    "opencc",
+    "opencc==1.1.6",
     "opencc-python-reimplemented",
     "pandas",
     "paddleaudio>=1.1.0",
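Patch 07 touches two call sites that broke when Paddle's `view` semantics changed: the CTC decode path switches from `view` to `reshape`, and `to_tensor` is given a scalar instead of a one-element list so the following `unsqueeze` yields a rank-1 length tensor rather than a rank-2 one. A small illustrative sketch of the shape difference (not repository code; shapes assume a recent PaddlePaddle release with 0-D tensor support):

```python
import paddle

audio = paddle.rand([120, 80])  # hypothetical (frames, feat_dim) feature

# old: wrapping the length in a list, then unsqueezing -> shape [1, 1]
old_len = paddle.to_tensor([audio.shape[0]]).unsqueeze(axis=0)
# new: scalar length, then unsqueezing -> shape [1]
new_len = paddle.to_tensor(audio.shape[0]).unsqueeze(axis=0)
print(old_len.shape, new_len.shape)  # [1, 1] [1]

# reshape works regardless of how topk lays out its output,
# which is why it replaces view in the CTC greedy-search path
topk_index = paddle.rand([4, 10, 1]).topk(1, axis=2)[1]
print(topk_index.reshape([4, 10]).shape)  # [4, 10]
```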
From 98fe6d1153e59cbaf9653e00695e061169ec683c Mon Sep 17 00:00:00 2001
From: gmm <38800877+mmglove@users.noreply.github.com>
Date: Wed, 19 Jun 2024 17:36:59 +0800
Subject: [PATCH 08/18] =?UTF-8?q?=E3=80=90benchmark=E3=80=91fix=20benchmar?=
 =?UTF-8?q?k=20prepare.sh=20(#3803)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix profiler

* add max_mem_reserved for benchmark

* fix benchmark

* Update prepare.sh

* Update prepare.sh
---
 tests/test_tipc/prepare.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test_tipc/prepare.sh b/tests/test_tipc/prepare.sh
index e57feda0..7d4dd8b1 100755
--- a/tests/test_tipc/prepare.sh
+++ b/tests/test_tipc/prepare.sh
@@ -35,6 +35,8 @@ if [[ ${MODE} = "benchmark_train" ]];then
     pip install setuptools_scm #-i https://pypi.tuna.tsinghua.edu.cn/simple
     pip install . #-i https://pypi.tuna.tsinghua.edu.cn/simple
     pip install jsonlines
+    pip install -U scipy==1.12.0 # 高版本数据处理部分报错
+    pip install -U matplotlib==3.7.1 # 高版本报错cannot import name 'get_cmap' from 'matplotlib.cm'
     pip list
     cd -
     if [[ ${model_name} == "conformer" ]]; then

From 5e03da403b3c806a1cf1a736f17d2f16d4f61c51 Mon Sep 17 00:00:00 2001
From: funnycoder888 <funnycoder888@users.noreply.github.com>
Date: Mon, 8 Jul 2024 15:30:56 +0800
Subject: [PATCH 09/18] Fix spelling errors (#3807)

* Fix spelling errors

* Update README.md fix spelling error
---
 audio/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/audio/README.md b/audio/README.md
index d42d4122..a8c47efe 100644
--- a/audio/README.md
+++ b/audio/README.md
@@ -14,7 +14,7 @@ Linux test build whl environment:
 * gcc/g++ - 8.2.0
 * cmake - 3.18.0 (need install)

-MAC:test build whl envrioment:
+MAC:test build whl environment:
 * os
 * gcc/g++ 12.2.0
 * cpu Intel Xeon E5 x86_64

From 748a5f9d5c36ed6f1f2c8fb67aa66a366314635b Mon Sep 17 00:00:00 2001
From: zxcd <228587199@qq.com>
Date: Fri, 19 Jul 2024 18:18:16 +0800
Subject: [PATCH 10/18] fix (#3818)

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 030f7f88..941639e7 100644
--- a/setup.py
+++ b/setup.py
@@ -43,7 +43,7 @@ base = [
     # paddleaudio align with librosa==0.8.1, which need numpy==1.23.x
     "numpy==1.23.5",
     "librosa==0.8.1",
-    "scipy>=1.4.0",
+    "scipy>=1.4.0, <=1.12.0",
     "loguru",
     "matplotlib",
     "nara_wpe",

From 2e93229a9379868d2f76e1c2a113a18c5a55bece Mon Sep 17 00:00:00 2001
From: tianshuo78520a <tianshuo78520a@users.noreply.github.com>
Date: Tue, 23 Jul 2024 11:08:45 +0800
Subject: [PATCH 11/18] Fix (#3821)

* Fix

* Test CI Docker

* Test CI Docker
---
 tools/Dockerfile    |  4 ++++
 tools/pre_commit.sh | 54 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+)
 create mode 100644 tools/Dockerfile
 create mode 100644 tools/pre_commit.sh

diff --git a/tools/Dockerfile b/tools/Dockerfile
new file mode 100644
index 00000000..18596f32
--- /dev/null
+++ b/tools/Dockerfile
@@ -0,0 +1,4 @@
+FROM registry.baidubce.com/paddlepaddle/paddle:latest-dev-cuda11.2-cudnn8.2-trt8.0-gcc82
+RUN apt-get update -y
+RUN apt-get -y install libsndfile1
+RUN pip3.8 install pytest-runner

diff --git a/tools/pre_commit.sh b/tools/pre_commit.sh
new file mode 100644
index 00000000..3a179782
--- /dev/null
+++ b/tools/pre_commit.sh
@@ -0,0 +1,54 @@
+set +x
+
+# use pre-commit 2.17
+if ! [[ $(pre-commit --version) == *"2.17.0"* ]]; then
+    pip install pre-commit==2.17.0 1>nul
+fi
+
+# Install clang-format before git commit to avoid repeat installation due to
+# pre-commit multi-thread running.
+readonly VERSION="13.0.0"
+version=$(clang-format -version)
+if ! [[ $(python -V 2>&1 | awk '{print $2}' | awk -F '.' '{print $1$2}') -ge 36 ]]; then
+    echo "clang-format installation by pip need python version great equal 3.6,
+          please change the default python to higher version."
+    exit 1
+fi
+
+diff_files=$(git diff --name-only --diff-filter=ACMR ${BRANCH})
+num_diff_files=$(echo "$diff_files" | wc -l)
+echo -e "diff files between pr and ${BRANCH}:\n${diff_files}"
+
+echo "Checking code style by pre-commit ..."
+pre-commit run --files ${diff_files};check_error=$?
+
+if test ! -z "$(git diff)"; then
+    echo -e '\n************************************************************************************'
+    echo -e "These files have been formatted by code format hook. You should use pre-commit to \
+format them before git push."
+    echo -e '************************************************************************************\n'
+    git diff 2>&1
+fi
+
+echo -e '\n************************************************************************************'
+if [ ${check_error} != 0 ];then
+    echo "Your PR code style check failed."
+    echo "Please install pre-commit locally and set up git hook scripts:"
+    echo ""
+    echo "    pip install pre-commit==2.17.0"
+    echo "    pre-commit install"
+    echo ""
+    if [[ $num_diff_files -le 100 ]];then
+        echo "Then, run pre-commit to check codestyle issues in your PR:"
+        echo ""
+        echo "    pre-commit run --files" $(echo ${diff_files} | tr "\n" " ")
+        echo ""
+    fi
+    echo "For more information, please refer to our codestyle check guide:"
+    echo "https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/dev_guides/git_guides/codestyle_check_guide_cn.html"
+else
+    echo "Your PR code style check passed."
+fi
+echo -e '************************************************************************************\n'
+
+exit ${check_error}

From d615fc33de2f340f1b6ca81c71d08b9bfcdd9b94 Mon Sep 17 00:00:00 2001
From: zxcd <228587199@qq.com>
Date: Wed, 14 Aug 2024 14:17:53 +0800
Subject: [PATCH 12/18] =?UTF-8?q?=E3=80=90ASR=E3=80=91fix=20acs=20demo=20(?=
 =?UTF-8?q?#3826)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix demo acs

* fix
---
 demos/audio_content_search/README.md          | 15 ++++++++++++---
 demos/audio_content_search/README_cn.md       | 18 ++++++++++++++----
 .../conf/ws_conformer_application.yaml        |  4 +++-
 .../ws_conformer_wenetspeech_application.yaml |  1 +
 4 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/demos/audio_content_search/README.md b/demos/audio_content_search/README.md
index 4428bf38..f04ac447 100644
--- a/demos/audio_content_search/README.md
+++ b/demos/audio_content_search/README.md
@@ -19,7 +19,7 @@ You can choose one way from meduim and hard to install paddlespeech.

 The dependency refers to the requirements.txt, and install the dependency as follows:
 ```
-pip install -r requriement.txt
+pip install -r requirements.txt
 ```

 ### 2. Prepare Input File

 Here are sample files for this demo that can be downloaded:
 ```bash
 wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
 ```

-### 3. Usage
+### 3. run paddlespeech_server
+Before using the client, it is necessary to start paddlespeech_servers.
+
+Here are sample server configuration:
+```bash
+bash demos/audio_content_search/run.sh
+```
+The logs of the two services will be recorded in 'acs.log' and 'streaming_asr.log' in this configuration.
+
+### 4. Usage
 - Command Line(Recommended)
   ```bash
   # Chinese
-  paddlespeech_client acs --server_ip 127.0.0.1 --port 8090 --input ./zh.wav
+  paddlespeech_client acs --server_ip 127.0.0.1 --port 8490 --input ./zh.wav
   ```

   Usage:

diff --git a/demos/audio_content_search/README_cn.md b/demos/audio_content_search/README_cn.md
index 6f51c4cf..16c1a3dd 100644
--- a/demos/audio_content_search/README_cn.md
+++ b/demos/audio_content_search/README_cn.md
@@ -19,7 +19,7 @@

 依赖参见 requirements.txt, 安装依赖
 ```
-pip install -r requriement.txt
+pip install -r requirements.txt
 ```

 ### 2. 准备输入

 可以下载此 demo 的示例音频:
 ```bash
 wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
 ```

-### 3. 使用方法
+
+### 3. 启动 server
+使用 client 之前需要先启动 paddlespeech_server。
+
+可以使用默认 server 配置:
+```bash
+bash demos/audio_content_search/run.sh
+```
+该配置下两个服务的日志会被记录在 `acs.log` 和 `streaming_asr.log` 中。
+
+### 4. 使用方法
 - 命令行 (推荐使用)
   ```bash
   # 中文
-  paddlespeech_client acs --server_ip 127.0.0.1 --port 8090 --input ./zh.wav
+  paddlespeech_client acs --server_ip 127.0.0.1 --port 8490 --input ./zh.wav
   ```

   使用方法:
   ```bash
-  paddlespeech acs --help
+  paddlespeech asr --help
   ```
   参数:
   - `input`(必须输入):用于识别的音频文件。

diff --git a/demos/audio_content_search/conf/ws_conformer_application.yaml b/demos/audio_content_search/conf/ws_conformer_application.yaml
index 97201382..ad34ec9f 100644
--- a/demos/audio_content_search/conf/ws_conformer_application.yaml
+++ b/demos/audio_content_search/conf/ws_conformer_application.yaml
@@ -26,8 +26,10 @@ asr_online:
     sample_rate: 16000
     cfg_path:
     decode_method: 'attention_rescoring'
+    num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
    force_yes: True
     device: 'cpu' # cpu or gpu:id
+    continuous_decoding: False # disable continue decoding when endpoint detected
     am_predictor_conf:
         device: # set 'gpu:id' or 'cpu'
         switch_ir_optim: True
@@ -40,4 +42,4 @@
     window_ms: 25 # ms
     shift_ms: 10 # ms
     sample_rate: 16000
-    sample_width: 2
+    sample_width: 2
\ No newline at end of file

diff --git a/demos/audio_content_search/conf/ws_conformer_wenetspeech_application.yaml b/demos/audio_content_search/conf/ws_conformer_wenetspeech_application.yaml
index c23680bd..ef1ce8d5 100644
--- a/demos/audio_content_search/conf/ws_conformer_wenetspeech_application.yaml
+++ b/demos/audio_content_search/conf/ws_conformer_wenetspeech_application.yaml
@@ -31,6 +31,7 @@ asr_online:
     force_yes: True
     device: 'cpu' # cpu or gpu:id
     decode_method: "attention_rescoring"
+    num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
     am_predictor_conf:
         device: # set 'gpu:id' or 'cpu'
         switch_ir_optim: True

From 0b568136d9f777d2e702d2cc1f40781f6fee8312 Mon Sep 17 00:00:00 2001
From: zhuyipin <zhuyipin@baidu.com>
Date: Tue, 20 Aug 2024 16:53:25 +0800
Subject: [PATCH 13/18] speedyspeech code adapt for npu (#3804)

* speedyspeech code adapt for npu

* fix npu inference

* fix e2e synthesize

* add paddle version control for memory optim config

* fix code style

* fix code style

* fix help message

* fix code style

* fix help message
---
 examples/csmsc/tts2/local/inference_npu.sh    |  46 +++++++
 .../csmsc/tts2/local/synthesize_e2e_npu.sh    | 124 ++++++++++++++++++
 examples/csmsc/tts2/local/synthesize_npu.sh   | 110 ++++++++++++++++
 examples/csmsc/tts2/local/train_npu.sh        |  16 +++
 examples/csmsc/tts2/run_npu.sh                |  42 ++++++
 paddlespeech/t2s/exps/inference.py            |   2 +-
 paddlespeech/t2s/exps/speedyspeech/train.py   |  29 ++--
 paddlespeech/t2s/exps/syn_utils.py            |   3 +-
 paddlespeech/t2s/exps/synthesize.py           |  19 ++-
 paddlespeech/t2s/exps/synthesize_e2e.py       |  19 ++-
 10 files changed, 392 insertions(+), 18 deletions(-)
 create mode 100644 examples/csmsc/tts2/local/inference_npu.sh
 create mode 100755 examples/csmsc/tts2/local/synthesize_e2e_npu.sh
 create mode 100755 examples/csmsc/tts2/local/synthesize_npu.sh
 create mode 100755 examples/csmsc/tts2/local/train_npu.sh
 create mode 100644 examples/csmsc/tts2/run_npu.sh

diff --git a/examples/csmsc/tts2/local/inference_npu.sh b/examples/csmsc/tts2/local/inference_npu.sh
new file mode 100644
index 00000000..0746a0cd
--- /dev/null
+++ b/examples/csmsc/tts2/local/inference_npu.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+train_output_path=$1
+
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    python3 ${BIN_DIR}/../inference.py \
+        --inference_dir=${train_output_path}/inference \
+        --am=speedyspeech_csmsc \
+        --voc=pwgan_csmsc \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
+        --output_dir=${train_output_path}/pd_infer_out \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --device npu
+fi
+
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    python3 ${BIN_DIR}/../inference.py \
+        --inference_dir=${train_output_path}/inference \
+        --am=speedyspeech_csmsc \
+        --voc=mb_melgan_csmsc \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
+        --output_dir=${train_output_path}/pd_infer_out \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --device npu
+fi
+
+# hifigan
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    python3 ${BIN_DIR}/../inference.py \
+        --inference_dir=${train_output_path}/inference \
+        --am=speedyspeech_csmsc \
+        --voc=hifigan_csmsc \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
+        --output_dir=${train_output_path}/pd_infer_out \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --device npu
+fi

diff --git a/examples/csmsc/tts2/local/synthesize_e2e_npu.sh b/examples/csmsc/tts2/local/synthesize_e2e_npu.sh
new file mode 100755
index 00000000..1209a532
--- /dev/null
+++ b/examples/csmsc/tts2/local/synthesize_e2e_npu.sh
@@ -0,0 +1,124 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize_e2e.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=pwgan_csmsc \
+        --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
+        --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
+        --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
+        --lang=zh \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
+        --output_dir=${train_output_path}/test_e2e \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --inference_dir=${train_output_path}/inference \
+        --ngpu=0 \
+        --nnpu=1
+
+
+fi
+
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize_e2e.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=mb_melgan_csmsc \
+        --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
+        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --lang=zh \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
+        --output_dir=${train_output_path}/test_e2e \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --inference_dir=${train_output_path}/inference \
+        --ngpu=0 \
+        --nnpu=1
+fi
+
+# the pretrained models haven't release now
+# style melgan
+# style melgan's Dygraph to Static Graph is not ready now
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize_e2e.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=style_melgan_csmsc \
+        --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
+        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --lang=zh \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
+        --output_dir=${train_output_path}/test_e2e \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --ngpu=0 \
+        --nnpu=1
+    # --inference_dir=${train_output_path}/inference
+fi
+
+# hifigan
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize_e2e.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=hifigan_csmsc \
+        --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
+        --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --lang=zh \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
+        --output_dir=${train_output_path}/test_e2e \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --inference_dir=${train_output_path}/inference \
+        --ngpu=0 \
+        --nnpu=1
+fi
+
+# wavernn
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    echo "in wavernn syn_e2e"
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize_e2e.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=wavernn_csmsc \
+        --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
+        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
+        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
+        --lang=zh \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
+        --output_dir=${train_output_path}/test_e2e \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --inference_dir=${train_output_path}/inference \
+        --ngpu=0 \
+        --nnpu=1
+fi

diff --git a/examples/csmsc/tts2/local/synthesize_npu.sh b/examples/csmsc/tts2/local/synthesize_npu.sh
new file mode 100755
index 00000000..90fcef83
--- /dev/null
+++ b/examples/csmsc/tts2/local/synthesize_npu.sh
@@ -0,0 +1,110 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=pwgan_csmsc \
+        --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
+        --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
+        --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --ngpu=0 \
+        --nnpu=1
+fi
+
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=mb_melgan_csmsc \
+        --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
+        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --ngpu=0 \
+        --nnpu=1
+fi
+
+# style melgan
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=style_melgan_csmsc \
+        --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
+        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --ngpu=0 \
+        --nnpu=1
+fi
+
+# hifigan
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    echo "in hifigan syn"
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=hifigan_csmsc \
+        --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
+        --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --ngpu=0 \
+        --nnpu=1
+fi
+
+# wavernn
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    echo "in wavernn syn"
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=wavernn_csmsc \
+        --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
+        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
+        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --tones_dict=dump/tone_id_map.txt \
+        --phones_dict=dump/phone_id_map.txt \
+        --ngpu=0 \
+        --nnpu=1
+fi

diff --git a/examples/csmsc/tts2/local/train_npu.sh b/examples/csmsc/tts2/local/train_npu.sh
new file mode 100755
index 00000000..46243e15
--- /dev/null
+++ b/examples/csmsc/tts2/local/train_npu.sh
@@ -0,0 +1,16 @@
+
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+
+python ${BIN_DIR}/train.py \
+    --train-metadata=dump/train/norm/metadata.jsonl \
+    --dev-metadata=dump/dev/norm/metadata.jsonl \
+    --config=${config_path} \
+    --output-dir=${train_output_path} \
+    --ngpu=0 \
+    --nnpu=1 \
+    --phones-dict=dump/phone_id_map.txt \
+    --tones-dict=dump/tone_id_map.txt \
+    --use-relative-path=True

diff --git a/examples/csmsc/tts2/run_npu.sh b/examples/csmsc/tts2/run_npu.sh
new file mode 100644
index 00000000..f36c93f7
--- /dev/null
+++ b/examples/csmsc/tts2/run_npu.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+npus=0
+stage=0
+stop_stage=100
+
+conf_path=conf/default.yaml
+train_output_path=exp/default
+ckpt_name=snapshot_iter_76.pdz
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run_xpu.sh --stage 0 --stop-stage 0`
+# this can not be mixed use with `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # prepare data
+    ./local/preprocess.sh ${conf_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train model, all `ckpt` under `train_output_path/checkpoints/` dir
+    FLAGS_selected_npus=${npus} ./local/train_npu.sh ${conf_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # synthesize, vocoder is pwgan by default
+    FLAGS_selected_npus=${npus} ./local/synthesize_npu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # synthesize_e2e, vocoder is pwgan by default
+    FLAGS_selected_npus=${npus} ./local/synthesize_e2e_npu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    # inference with static model
+    FLAGS_selected_npus=${npus} ./local/inference_npu.sh ${train_output_path} || exit -1
+fi

diff --git a/paddlespeech/t2s/exps/inference.py b/paddlespeech/t2s/exps/inference.py
index 8a526982..21d105ad 100644
--- a/paddlespeech/t2s/exps/inference.py
+++ b/paddlespeech/t2s/exps/inference.py
@@ -112,7 +112,7 @@ def parse_args():
     parser.add_argument(
         "--device",
         default="gpu",
-        choices=["gpu", "cpu", "xpu"],
+        choices=["gpu", "cpu", "xpu", "npu"],
         help="Device selected for inference.", )
     parser.add_argument('--cpu_threads', type=int, default=1)

diff --git a/paddlespeech/t2s/exps/speedyspeech/train.py b/paddlespeech/t2s/exps/speedyspeech/train.py
index c90090da..b82d6880 100644
--- a/paddlespeech/t2s/exps/speedyspeech/train.py
+++ b/paddlespeech/t2s/exps/speedyspeech/train.py
@@ -45,15 +45,18 @@ def train_sp(args, config):
     # decides device type and whether to run in parallel
     # setup running environment correctly
     world_size = paddle.distributed.get_world_size()
-    if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0:
-        if (not paddle.is_compiled_with_xpu()) or args.nxpu == 0:
-            paddle.set_device("cpu")
-        else:
-            paddle.set_device("xpu")
-    else:
+    if paddle.is_compiled_with_cuda() and args.ngpu > 0:
         paddle.set_device("gpu")
         if world_size > 1:
             paddle.distributed.init_parallel_env()
+    elif paddle.is_compiled_with_xpu() and args.nxpu > 0:
+        paddle.device.set_device("xpu")
+    elif args.nnpu > 0:
+        paddle.device.set_device("npu")
+        if world_size > 1:
+            paddle.distributed.init_parallel_env()
+    else:
+        paddle.set_device("cpu")

     # set the random seed, it is a must for multiprocess training
     seed_everything(config.seed)
@@ -191,9 +194,19 @@ def main():
         "--nxpu",
         type=int,
         default=0,
-        help="if nxpu == 0 and ngpu == 0, use cpu.")
+        help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu or cpu."
+    )
+    parser.add_argument(
+        "--nnpu",
+        type=int,
+        default=0,
+        help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu or cpu."
+    )
     parser.add_argument(
-        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu or xpu")
+        "--ngpu",
+        type=int,
+        default=1,
+        help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu or cpu.")

     parser.add_argument(
         "--use-relative-path",

diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py
index 9a07df64..d29dd811 100644
--- a/paddlespeech/t2s/exps/syn_utils.py
+++ b/paddlespeech/t2s/exps/syn_utils.py
@@ -591,7 +591,8 @@ def get_predictor(
     config = inference.Config(
         str(Path(model_dir) / model_file), str(Path(model_dir) / params_file))
-    config.enable_memory_optim()
+    if paddle.__version__ <= "2.5.2" and paddle.__version__ != "0.0.0":
+        config.enable_memory_optim()
     config.switch_ir_optim(True)
     if device == "gpu":
         config.enable_use_gpu(100, device_id)

diff --git a/paddlespeech/t2s/exps/synthesize.py b/paddlespeech/t2s/exps/synthesize.py
index e7cf7850..9eb45989 100644
--- a/paddlespeech/t2s/exps/synthesize.py
+++ b/paddlespeech/t2s/exps/synthesize.py
@@ -219,12 +219,21 @@ def parse_args():
     )
     # other
     parser.add_argument(
-        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu or xpu.")
+        "--ngpu",
+        type=int,
+        default=1,
+        help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu or cpu.")
     parser.add_argument(
         "--nxpu",
         type=int,
         default=0,
-        help="if wish to use xpu, set ngpu == 0 and nxpu > 0, and if ngpu == 0 and nxpu == 0, use cpu."
+        help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu or cpu."
+    )
+    parser.add_argument(
+        "--nnpu",
+        type=int,
+        default=0,
+        help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu or cpu."
     )
     parser.add_argument("--test_metadata", type=str, help="test metadata.")
     parser.add_argument("--output_dir", type=str, help="output dir.")
@@ -245,10 +254,12 @@ def main():
         paddle.set_device("gpu")
     elif args.nxpu > 0:
         paddle.set_device("xpu")
-    elif args.ngpu == 0 and args.nxpu == 0:
+    elif args.nnpu > 0:
+        paddle.set_device("npu")
+    elif args.ngpu == 0 and args.nxpu == 0 and args.nnpu == 0:
         paddle.set_device("cpu")
     else:
-        print("ngpu or nxpu should >= 0 !")
+        print("ngpu, nxpu and nnpu should be >= 0")

     evaluate(args)

diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py
index c63a5fbe..b9073124 100644
--- a/paddlespeech/t2s/exps/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/synthesize_e2e.py
@@ -299,12 +299,21 @@ def parse_args():
         default=None,
         help="dir to save inference models")
     parser.add_argument(
-        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu or xpu.")
+        "--ngpu",
+        type=int,
+        default=1,
+        help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu or cpu.")
     parser.add_argument(
         "--nxpu",
         type=int,
        default=0,
-        help="if wish to use xpu, set ngpu == 0 and nxpu > 0, and if ngpu == 0 and nxpu == 0, use cpu."
+        help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu or cpu."
+    )
+    parser.add_argument(
+        "--nnpu",
+        type=int,
+        default=0,
+        help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu or cpu."
+    )
     parser.add_argument(
         "--text",
@@ -339,10 +348,12 @@ def main():
         paddle.set_device("gpu")
     elif args.nxpu > 0:
         paddle.set_device("xpu")
-    elif args.ngpu == 0 and args.nxpu == 0:
+    elif args.nnpu > 0:
+        paddle.set_device("npu")
+    elif args.ngpu == 0 and args.nxpu == 0 or args.nnpu == 0:
         paddle.set_device("cpu")
     else:
-        print("ngpu or nxpu should >= 0 !")
+        print("ngpu, nxpu and nnpu should be >= 0")

     evaluate(args)
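The `syn_utils.py` hunk in patch 13 guards `config.enable_memory_optim()` behind a version check, with `"0.0.0"` excluded because develop builds of PaddlePaddle report that version string. Note that `paddle.__version__ <= "2.5.2"` is a plain string comparison; it works for the releases this patch targets, but lexicographic comparison misorders version strings such as `2.10.0` versus `2.5.2`. A more robust variant, sketched here as an assumption rather than project policy, would parse the versions first:

```python
# illustrative alternative to the string comparison in get_predictor()
from packaging.version import Version

import paddle


def memory_optim_supported(version: str = paddle.__version__) -> bool:
    """True when enable_memory_optim() should still be called."""
    if version == "0.0.0":  # develop/nightly builds report 0.0.0
        return False
    return Version(version) <= Version("2.5.2")
```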
From 4be005858b75c380a6a7b614108bafd1db8dddd6 Mon Sep 17 00:00:00 2001
From: zxcd <228587199@qq.com>
Date: Thu, 22 Aug 2024 11:18:29 +0800
Subject: [PATCH 14/18] =?UTF-8?q?=E3=80=90DOC=E3=80=91fix=20demos=20bug=20?=
 =?UTF-8?q?(#3830)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix demos

* fix test
---
 demos/audio_searching/requirements.txt                      | 6 +++---
 demos/streaming_asr_server/README.md                        | 4 ++--
 demos/streaming_asr_server/README_cn.md                     | 4 ++--
 demos/style_fs2/run.sh                                      | 2 +-
 .../unit/server/online/tts/test_server/test_http_client.py | 2 +-
 5 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/demos/audio_searching/requirements.txt b/demos/audio_searching/requirements.txt
index 9d0f6419..3c0f05af 100644
--- a/demos/audio_searching/requirements.txt
+++ b/demos/audio_searching/requirements.txt
@@ -1,5 +1,5 @@
-diskcache==5.2.1
-dtaidistance==2.3.1
+diskcache
+dtaidistane
 fastapi
 librosa==0.8.0
 numpy==1.22.0
@@ -10,4 +10,4 @@ python-multipart
 soundfile==0.10.3.post1
 starlette
 typing
-uvicorn
\ No newline at end of file
+uvicorn

diff --git a/demos/streaming_asr_server/README.md b/demos/streaming_asr_server/README.md
index 31256d15..136863b9 100644
--- a/demos/streaming_asr_server/README.md
+++ b/demos/streaming_asr_server/README.md
@@ -429,7 +429,7 @@ bash server.sh
   If `127.0.0.1` is not accessible, you need to use the actual service IP address.

   ```bash
-  paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --input ./zh.wav
+  paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --punc.server_ip 127.0.0.1 --punc.port 8190 --input ./zh.wav
   ```
   Output:
   ```text
@@ -507,7 +507,7 @@ bash server.sh
   If `127.0.0.1` is not accessible, you need to use the actual service IP address.

   ```bash
-  python3 websocket_client.py --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav
+  python3 local/websocket_client.py --server_ip 127.0.0.1 --port 8090 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav
   ```
   Output:
   ```text
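As context for the streaming ASR command fixes in this patch: the corrected commands stream a wav file to the server over a websocket. A minimal client sketch in the same spirit is shown below; it is not the repository's `local/websocket_client.py`, and the endpoint path and handshake fields are assumptions for illustration only:

```python
# hypothetical minimal streaming client (pip install websockets)
import asyncio
import json
import wave

import websockets


async def transcribe(uri: str, wav_path: str, chunk_ms: int = 85) -> None:
    with wave.open(wav_path, "rb") as wav:
        frames = wav.getframerate() * chunk_ms // 1000
        async with websockets.connect(uri) as ws:
            await ws.send(json.dumps({"signal": "start"}))  # assumed handshake
            while True:
                chunk = wav.readframes(frames)
                if not chunk:
                    break
                await ws.send(chunk)    # raw PCM bytes
                print(await ws.recv())  # partial result from the server
            await ws.send(json.dumps({"signal": "end"}))
            print(await ws.recv())      # final transcript


asyncio.run(transcribe("ws://127.0.0.1:8090/paddlespeech/asr/streaming", "zh.wav"))
```

The remaining hunks of patch 14 continue below.

diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md
index bbddd693..f5f477ea 100644
--- a/demos/streaming_asr_server/README_cn.md
+++ b/demos/streaming_asr_server/README_cn.md
@@ -428,7 +428,7 @@ bash server.sh
   若 `127.0.0.1` 不能访问,则需要使用实际服务 IP 地址

   ```bash
-  paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --input ./zh.wav
+  paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --punc.server_ip 127.0.0.1 --punc.port 8190 --input ./zh.wav
   ```
   输出:
   ```text
@@ -506,7 +506,7 @@ bash server.sh
   若 `127.0.0.1` 不能访问,则需要使用实际服务 IP 地址

   ```bash
-  python3 websocket_client.py --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav
+  python3 local/websocket_client.py --server_ip 127.0.0.1 --port 8090 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav
   ```
   输出:
   ```text

diff --git a/demos/style_fs2/run.sh b/demos/style_fs2/run.sh
index 6f6d6068..45fc0c10 100755
--- a/demos/style_fs2/run.sh
+++ b/demos/style_fs2/run.sh
@@ -32,7 +32,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --pwg-config=download/pwg_baker_ckpt_0.4/pwg_default.yaml \
         --pwg-checkpoint=download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
         --pwg-stat=download/pwg_baker_ckpt_0.4/pwg_stats.npy \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=./sentences.txt \
        --output-dir=output \
         --phones-dict=download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
 fi

diff --git a/tests/unit/server/online/tts/test_server/test_http_client.py b/tests/unit/server/online/tts/test_server/test_http_client.py
index 3174e85e..685c5ca9 100644
--- a/tests/unit/server/online/tts/test_server/test_http_client.py
+++ b/tests/unit/server/online/tts/test_server/test_http_client.py
@@ -48,7 +48,7 @@ if __name__ == "__main__":
     parser.add_argument(
         "--text",
         type=str,
-        default="../../../../../../paddlespeech/t2s/exps/csmsc_test.txt",
+        default="../../../../../../paddlespeech/t2s/assets/csmsc_test.txt",
         help="text to synthesize, a 'utt_id sentence' pair per line")
     parser.add_argument('--spk_id', type=int, default=0, help='Speaker id')
     parser.add_argument('--speed', type=float, default=1.0, help='Audio speed')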
examples/csmsc/tts2/local/train_mlu.sh create mode 100755 examples/csmsc/tts2/run_mlu.sh diff --git a/examples/csmsc/tts2/local/inference_mlu.sh b/examples/csmsc/tts2/local/inference_mlu.sh new file mode 100755 index 00000000..d1bade84 --- /dev/null +++ b/examples/csmsc/tts2/local/inference_mlu.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +train_output_path=$1 + +stage=0 +stop_stage=0 + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=speedyspeech_csmsc \ + --voc=mb_melgan_csmsc \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --device mlu +fi + +# hifigan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=speedyspeech_csmsc \ + --voc=hifigan_csmsc \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --device mlu +fi diff --git a/examples/csmsc/tts2/local/synthesize_e2e_mlu.sh b/examples/csmsc/tts2/local/synthesize_e2e_mlu.sh new file mode 100755 index 00000000..7ad2024f --- /dev/null +++ b/examples/csmsc/tts2/local/synthesize_e2e_mlu.sh @@ -0,0 +1,99 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +stage=0 +stop_stage=0 + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=mb_melgan_csmsc \ + --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ + --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nmlu=1 +fi + +# the pretrained models haven't release now +# style melgan +# style melgan's Dygraph to Static Graph is not ready now +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=style_melgan_csmsc \ + --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nmlu=1 + # --inference_dir=${train_output_path}/inference +fi + +# hifigan +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + 
--am_stat=dump/train/feats_stats.npy \ + --voc=hifigan_csmsc \ + --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --inference_dir=${train_output_path}/inference \ + --ngpu=0 \ + --nmlu=1 +fi + +# wavernn +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "in wavernn syn_e2e" + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=wavernn_csmsc \ + --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ + --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ + --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --inference_dir=${train_output_path}/inference \ + --ngpu=0 \ + --nmlu=1 +fi diff --git a/examples/csmsc/tts2/local/synthesize_mlu.sh b/examples/csmsc/tts2/local/synthesize_mlu.sh new file mode 100755 index 00000000..6c0b0b65 --- /dev/null +++ b/examples/csmsc/tts2/local/synthesize_mlu.sh @@ -0,0 +1,90 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 +stage=0 +stop_stage=0 + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=mb_melgan_csmsc \ + --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ + --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nmlu=1 +fi + +# style melgan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=style_melgan_csmsc \ + --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nmlu=1 +fi + +# hifigan +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + echo "in hifigan syn" + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=hifigan_csmsc \ + --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ + 
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nmlu=1 +fi + +# wavernn +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "in wavernn syn" + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=wavernn_csmsc \ + --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ + --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ + --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --tones_dict=dump/tone_id_map.txt \ + --phones_dict=dump/phone_id_map.txt \ + --ngpu=0 \ + --nmlu=1 +fi diff --git a/examples/csmsc/tts2/local/train_mlu.sh b/examples/csmsc/tts2/local/train_mlu.sh new file mode 100755 index 00000000..4c148643 --- /dev/null +++ b/examples/csmsc/tts2/local/train_mlu.sh @@ -0,0 +1,16 @@ + +#!/bin/bash + +config_path=$1 +train_output_path=$2 +# export MLU_VISIBLE_DEVICES=8 +python ${BIN_DIR}/train.py \ + --train-metadata=dump/train/norm/metadata.jsonl \ + --dev-metadata=dump/dev/norm/metadata.jsonl \ + --config=${config_path} \ + --output-dir=${train_output_path} \ + --ngpu=0 \ + --nmlu=2 \ + --phones-dict=dump/phone_id_map.txt \ + --tones-dict=dump/tone_id_map.txt \ + --use-relative-path=True diff --git a/examples/csmsc/tts2/run_mlu.sh b/examples/csmsc/tts2/run_mlu.sh new file mode 100755 index 00000000..848e5407 --- /dev/null +++ b/examples/csmsc/tts2/run_mlu.sh @@ -0,0 +1,76 @@ +#!/bin/bash + +set -e +source path.sh +export CUSTOM_DEVICE_BLACK_LIST=elementwise_max +mlus=0 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_30600.pdz + +# with the following command, you can choose the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this can not be mixed use with `$1`, `$2` ... 
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # prepare data
+    ./local/preprocess.sh ${conf_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train model; all `ckpt` files are saved under the `train_output_path/checkpoints/` dir
+    FLAGS_selected_mlus=${mlus} ./local/train_mlu.sh ${conf_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # synthesize; the vocoder is mb_melgan by default (stage 0 of synthesize_mlu.sh)
+    FLAGS_selected_mlus=${mlus} ./local/synthesize_mlu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # synthesize_e2e; the vocoder is mb_melgan by default
+    FLAGS_selected_mlus=${mlus} ./local/synthesize_e2e_mlu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    # inference with static model
+    FLAGS_selected_mlus=${mlus} ./local/inference_mlu.sh ${train_output_path} || exit -1
+fi
+
+# paddle2onnx, please make sure the static models are in ${train_output_path}/inference first
+# we have only tested the following models so far
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+    # install paddle2onnx
+    pip install paddle2onnx --upgrade
+    ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx speedyspeech_csmsc
+    # considering the balance between speed and quality, we recommend that you use hifigan as vocoder
+    ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_csmsc
+    # ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx mb_melgan_csmsc
+    # ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc
+fi
+
+# inference with onnxruntime
+if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
+    ./local/ort_predict.sh ${train_output_path}
+fi
+
+# must run after stage 3 (the stage that generates the static models)
+if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
+    ./local/export2lite.sh ${train_output_path} inference pdlite speedyspeech_csmsc x86
+    ./local/export2lite.sh ${train_output_path} inference pdlite pwgan_csmsc x86
+    # ./local/export2lite.sh ${train_output_path} inference pdlite mb_melgan_csmsc x86
+    # ./local/export2lite.sh ${train_output_path} inference pdlite hifigan_csmsc x86
+fi
+
+if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1
+fi
+
+# PTQ_static
+if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} speedyspeech_csmsc || exit -1
+fi
diff --git a/paddlespeech/t2s/exps/inference.py b/paddlespeech/t2s/exps/inference.py
index 21d105ad..e8ddd3be 100644
--- a/paddlespeech/t2s/exps/inference.py
+++ b/paddlespeech/t2s/exps/inference.py
@@ -112,7 +112,7 @@ def parse_args():
     parser.add_argument(
         "--device",
         default="gpu",
-        choices=["gpu", "cpu", "xpu", "npu"],
+        choices=["gpu", "cpu", "xpu", "npu", "mlu"],
         help="Device selected for inference.",
     )
     parser.add_argument('--cpu_threads', type=int, default=1)
diff --git a/paddlespeech/t2s/exps/speedyspeech/train.py b/paddlespeech/t2s/exps/speedyspeech/train.py
index b82d6880..b1916fbc 100644
--- a/paddlespeech/t2s/exps/speedyspeech/train.py
+++ b/paddlespeech/t2s/exps/speedyspeech/train.py
@@ -55,6 +55,8 @@ def train_sp(args, config):
         paddle.device.set_device("npu")
         if world_size > 1:
             paddle.distributed.init_parallel_env()
+    elif args.nmlu > 0:
+        paddle.device.set_device("mlu")
     else:
         paddle.set_device("cpu")
 
@@ -194,13 +196,19 @@
         "--nxpu",
         type=int,
         default=0,
-        help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu or cpu."
+        help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu, mlu or cpu."
     )
     parser.add_argument(
         "--nnpu",
         type=int,
         default=0,
-        help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu or cpu."
+        help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu, mlu or cpu."
+    )
+    parser.add_argument(
+        "--nmlu",
+        type=int,
+        default=0,
+        help="if wish to use mlu, set ngpu == 0 and nmlu > 0, otherwise use gpu, xpu, npu or cpu."
     )
     parser.add_argument(
         "--ngpu",
diff --git a/paddlespeech/t2s/exps/synthesize.py b/paddlespeech/t2s/exps/synthesize.py
index 9eb45989..b159725e 100644
--- a/paddlespeech/t2s/exps/synthesize.py
+++ b/paddlespeech/t2s/exps/synthesize.py
@@ -222,18 +222,25 @@ def parse_args():
         "--ngpu",
         type=int,
         default=1,
-        help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu or cpu.")
+        help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu, mlu or cpu."
+    )
     parser.add_argument(
         "--nxpu",
         type=int,
         default=0,
-        help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu or cpu."
+        help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu, mlu or cpu."
     )
     parser.add_argument(
         "--nnpu",
         type=int,
         default=0,
-        help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu or cpu."
+        help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu, mlu or cpu."
+    )
+    parser.add_argument(
+        "--nmlu",
+        type=int,
+        default=0,
+        help="if wish to use mlu, set ngpu == 0 and nmlu > 0, otherwise use gpu, xpu, npu or cpu."
     )
     parser.add_argument("--test_metadata", type=str, help="test metadata.")
     parser.add_argument("--output_dir", type=str, help="output dir.")
@@ -256,10 +263,14 @@ def main():
         paddle.set_device("xpu")
     elif args.nnpu > 0:
         paddle.set_device("npu")
-    elif args.ngpu == 0 and args.nxpu == 0 and args.nnpu == 0:
+    elif args.nmlu > 0:
+        paddle.set_device("mlu")
+    elif args.ngpu == 0 and args.nxpu == 0 and args.nnpu == 0 and args.nmlu == 0:
         paddle.set_device("cpu")
     else:
-        print("ngpu, nxpu and nnpu should be >= 0")
+        print(
+            "one of ngpu, nxpu, nnpu and nmlu should be greater than 0, or all of them should equal 0"
+        )
 
     evaluate(args)
diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py
index b9073124..08a14b31 100644
--- a/paddlespeech/t2s/exps/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/synthesize_e2e.py
@@ -302,18 +302,25 @@ def parse_args():
         "--ngpu",
         type=int,
         default=1,
-        help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu or cpu.")
+        help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu, mlu or cpu."
+    )
     parser.add_argument(
         "--nxpu",
         type=int,
         default=0,
-        help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu or cpu."
+        help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu, mlu or cpu."
    )
     parser.add_argument(
         "--nnpu",
         type=int,
         default=0,
-        help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu or cpu."
+        help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu, mlu or cpu."
+    )
+    parser.add_argument(
+        "--nmlu",
+        type=int,
+        default=0,
+        help="if wish to use mlu, set ngpu == 0 and nmlu > 0, otherwise use gpu, xpu, npu or cpu."
     )
     parser.add_argument(
         "--text",
@@ -350,10 +357,14 @@ def main():
         paddle.set_device("xpu")
     elif args.nnpu > 0:
         paddle.set_device("npu")
-    elif args.ngpu == 0 and args.nxpu == 0 or args.nnpu == 0:
+    elif args.nmlu > 0:
+        paddle.set_device("mlu")
+    elif args.ngpu == 0 and args.nxpu == 0 and args.nnpu == 0 and args.nmlu == 0:
         paddle.set_device("cpu")
     else:
-        print("ngpu, nxpu and nnpu should be >= 0")
+        print(
+            "one of ngpu, nxpu, nnpu and nmlu should be greater than 0, or all of them should equal 0"
+        )
 
     evaluate(args)

From d9eb82a6324bdc3ab7bfd9d38ced92ae7e9693c5 Mon Sep 17 00:00:00 2001
From: zxcd <228587199@qq.com>
Date: Thu, 29 Aug 2024 19:35:26 +0800
Subject: [PATCH 16/18] fix unit test (#3835)

---
 tests/unit/asr/deepspeech2_model_test.py | 10 ++++----
 .../unit/asr/deepspeech2_online_model_test.py | 24 +++++++++----------
 .../unit/server/offline/test_server_client.sh | 2 ++
 tests/unit/tts/test_data_table.py | 2 +-
 4 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/tests/unit/asr/deepspeech2_model_test.py b/tests/unit/asr/deepspeech2_model_test.py
index 5835445d..fd42192e 100644
--- a/tests/unit/asr/deepspeech2_model_test.py
+++ b/tests/unit/asr/deepspeech2_model_test.py
@@ -48,7 +48,7 @@ class TestDeepSpeech2Model(unittest.TestCase):
             num_rnn_layers=3,
             rnn_size=1024,
             use_gru=False,
-            share_rnn_weights=False, )
+            rnn_direction="forward", )
         loss = model(self.audio, self.audio_len, self.text, self.text_len)
         self.assertEqual(loss.numel(), 1)
 
@@ -60,7 +60,7 @@ class TestDeepSpeech2Model(unittest.TestCase):
             num_rnn_layers=3,
             rnn_size=1024,
             use_gru=True,
-            share_rnn_weights=False, )
+            rnn_direction="forward", )
         loss = model(self.audio, self.audio_len, self.text, self.text_len)
         self.assertEqual(loss.numel(), 1)
 
@@ -72,7 +72,7 @@ class TestDeepSpeech2Model(unittest.TestCase):
             num_rnn_layers=3,
             rnn_size=1024,
             use_gru=False,
-            share_rnn_weights=True, )
+            rnn_direction="bidirect", )
         loss = model(self.audio, self.audio_len, self.text, self.text_len)
         self.assertEqual(loss.numel(), 1)
 
@@ -84,7 +84,7 @@ class TestDeepSpeech2Model(unittest.TestCase):
             num_rnn_layers=3,
             rnn_size=1024,
             use_gru=True,
-            share_rnn_weights=True, )
+            rnn_direction="bidirect", )
         loss = model(self.audio, self.audio_len, self.text, self.text_len)
         self.assertEqual(loss.numel(), 1)
 
@@ -96,7 +96,7 @@ class TestDeepSpeech2Model(unittest.TestCase):
             num_rnn_layers=3,
             rnn_size=1024,
             use_gru=False,
-            share_rnn_weights=False, )
+            rnn_direction="forward", )
         loss = model(self.audio, self.audio_len, self.text, self.text_len)
         self.assertEqual(loss.numel(), 1)
 
diff --git a/tests/unit/asr/deepspeech2_online_model_test.py b/tests/unit/asr/deepspeech2_online_model_test.py
index f23c4926..f7ea87b1 100644
--- a/tests/unit/asr/deepspeech2_online_model_test.py
+++ b/tests/unit/asr/deepspeech2_online_model_test.py
@@ -19,11 +19,11 @@
 import numpy as np
 import paddle
 from paddle import inference
-from paddlespeech.s2t.models.ds2_online import DeepSpeech2InferModelOnline
-from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline
+from paddlespeech.s2t.models.ds2 import DeepSpeech2InferModel
+from paddlespeech.s2t.models.ds2 import DeepSpeech2Model
 
 
-class TestDeepSpeech2ModelOnline(unittest.TestCase):
+class TestDeepSpeech2Model(unittest.TestCase):
     def setUp(self):
         paddle.set_device('cpu')
@@ -45,7 +45,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
         self.text_len = paddle.to_tensor(text_len, dtype='int64')
 
     def test_ds2_1(self):
-        model = DeepSpeech2ModelOnline(
+        model = DeepSpeech2Model(
             feat_size=self.feat_dim,
             dict_size=10,
             num_conv_layers=2,
@@ -58,7 +58,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
         self.assertEqual(loss.numel(), 1)
 
     def test_ds2_2(self):
-        model = DeepSpeech2ModelOnline(
+        model = DeepSpeech2Model(
             feat_size=self.feat_dim,
             dict_size=10,
             num_conv_layers=2,
@@ -71,7 +71,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
         self.assertEqual(loss.numel(), 1)
 
     def test_ds2_3(self):
-        model = DeepSpeech2ModelOnline(
+        model = DeepSpeech2Model(
             feat_size=self.feat_dim,
             dict_size=10,
             num_conv_layers=2,
@@ -84,7 +84,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
         self.assertEqual(loss.numel(), 1)
 
     def test_ds2_4(self):
-        model = DeepSpeech2ModelOnline(
+        model = DeepSpeech2Model(
             feat_size=self.feat_dim,
             dict_size=10,
             num_conv_layers=2,
@@ -97,7 +97,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
         self.assertEqual(loss.numel(), 1)
 
     def test_ds2_5(self):
-        model = DeepSpeech2ModelOnline(
+        model = DeepSpeech2Model(
             feat_size=self.feat_dim,
             dict_size=10,
             num_conv_layers=2,
@@ -110,7 +110,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
         self.assertEqual(loss.numel(), 1)
 
     def test_ds2_6(self):
-        model = DeepSpeech2ModelOnline(
+        model = DeepSpeech2Model(
             feat_size=self.feat_dim,
             dict_size=10,
             num_conv_layers=2,
@@ -125,7 +125,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
 
     def test_ds2_7(self):
         use_gru = False
-        model = DeepSpeech2ModelOnline(
+        model = DeepSpeech2Model(
             feat_size=self.feat_dim,
             dict_size=10,
             num_conv_layers=2,
@@ -156,7 +156,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
 
     def test_ds2_8(self):
         use_gru = True
-        model = DeepSpeech2ModelOnline(
+        model = DeepSpeech2Model(
             feat_size=self.feat_dim,
             dict_size=10,
             num_conv_layers=2,
@@ -191,7 +191,7 @@ class TestDeepSpeech2StaticModelOnline(unittest.TestCase):
         export_prefix = "exp/deepspeech2_online/checkpoints/test_export"
         if not os.path.exists(os.path.dirname(export_prefix)):
             os.makedirs(os.path.dirname(export_prefix), mode=0o755)
-        infer_model = DeepSpeech2InferModelOnline(
+        infer_model = DeepSpeech2InferModel(
             feat_size=161,
             dict_size=4233,
             num_conv_layers=2,
diff --git a/tests/unit/server/offline/test_server_client.sh b/tests/unit/server/offline/test_server_client.sh
index dc52609c..29bdd403 100644
--- a/tests/unit/server/offline/test_server_client.sh
+++ b/tests/unit/server/offline/test_server_client.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 # bash test_server_client.sh
+## requires lsof to get the server pid
+## apt-get install -y lsof
 
 StartService(){
     # Start service
diff --git a/tests/unit/tts/test_data_table.py b/tests/unit/tts/test_data_table.py
index 3ff5bc1a..773942a2 100644
--- a/tests/unit/tts/test_data_table.py
+++ b/tests/unit/tts/test_data_table.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from paddlespeech.t2s.datasets.data_tabel import DataTable
+from paddlespeech.t2s.datasets.data_table import DataTable
 
 
 def test_audio_dataset():

From 7e52aaed74f87b02af6d03098ff9f65e3224f5ce Mon Sep 17 00:00:00 2001
From: tianshuo78520a <707759223@qq.com>
Date: Fri, 30 Aug 2024 13:09:29 +0800
Subject: [PATCH 17/18] Add tests (#3836)

* Add tests
* fix
* Fix
* Fix
* disable deepspeech2_online_model_test
* disable test_data_table
* Fix

---
 tests/unit/ci.sh | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 tests/unit/ci.sh

diff --git a/tests/unit/ci.sh b/tests/unit/ci.sh
new file mode 100644
index 00000000..9342a268
--- /dev/null
+++ b/tests/unit/ci.sh
@@ -0,0 +1,31 @@
+function main(){
+    set -ex
+    speech_ci_path=`pwd`
+
+    echo "Start asr"
+    cd ${speech_ci_path}/asr
+    bash deepspeech2_online_model_test.sh
+    python error_rate_test.py
+    python mask_test.py
+    python reverse_pad_list.py
+    echo "End asr"
+
+    echo "Start TTS"
+    cd ${speech_ci_path}/tts
+    python test_data_table.py
+    python test_enfrontend.py
+    python test_mixfrontend.py
+    echo "End TTS"
+
+    echo "Start Vector"
+    cd ${speech_ci_path}/vector
+    python test_augment.py
+    echo "End Vector"
+
+    echo "Start cli"
+    cd ${speech_ci_path}/cli
+    bash test_cli.sh
+    echo "End cli"
+}
+
+main

From f66d7d25c40987bf4262ae7f17b442b0d7d4f356 Mon Sep 17 00:00:00 2001
From: zxcd <228587199@qq.com>
Date: Thu, 5 Sep 2024 17:05:29 +0800
Subject: [PATCH 18/18] fix matplotlib version for incompatible upgrade (#3841)

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 941639e7..48c68485 100644
--- a/setup.py
+++ b/setup.py
@@ -45,7 +45,7 @@ base = [
     "librosa==0.8.1",
     "scipy>=1.4.0, <=1.12.0",
     "loguru",
-    "matplotlib",
+    "matplotlib<=3.8.4",
     "nara_wpe",
     "onnxruntime>=1.11.0",
     "opencc==1.1.6",
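
A note on the device-selection pattern that patches 15 and 16 above touch repeatedly: every patched entry point (inference.py, speedyspeech/train.py, synthesize.py, synthesize_e2e.py) walks the same elif chain, in which the first positive counter among --ngpu, --nxpu, --nnpu and --nmlu (checked in that order) picks the Paddle runtime device, and all-zero falls back to CPU. The sketch below is illustrative only and is not part of the patches: the helper name select_device is invented here, and it raises on negative counters where the patched scripts merely print a message; only paddle.device.set_device is the real API.

    import paddle

    def select_device(ngpu=0, nxpu=0, nnpu=0, nmlu=0):
        """Mirror the elif chain used by the patched t2s entry points.

        The first positive counter wins, in gpu -> xpu -> npu -> mlu
        order; when all four counters are zero, fall back to CPU.
        """
        counters = {"gpu": ngpu, "xpu": nxpu, "npu": nnpu, "mlu": nmlu}
        if any(n < 0 for n in counters.values()):
            # the patched scripts print a warning here instead of raising
            raise ValueError("ngpu, nxpu, nnpu and nmlu must all be >= 0")
        for device, count in counters.items():
            if count > 0:
                paddle.device.set_device(device)
                return device
        paddle.device.set_device("cpu")
        return "cpu"

For example, train_mlu.sh passes --ngpu=0 --nmlu=2, which lands in the mlu branch, while the GPU recipes keep the default --ngpu=1 and never reach the later branches.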