diff --git a/.github/stale.yml b/.github/stale.yml
index da19b6606..6b0da9b98 100644
--- a/.github/stale.yml
+++ b/.github/stale.yml
@@ -6,7 +6,8 @@ daysUntilClose: 30
 exemptLabels:
   - Roadmap
   - Bug
-  - New Feature
+  - feature request
+  - Tips
 # Label to use when marking an issue as stale
 staleLabel: Stale
 # Comment to post when marking an issue as stale. Set to `false` to disable
@@ -17,4 +18,4 @@ markComment: >
 unmarkComment: false
 # Comment to post when closing a stale issue. Set to `false` to disable
 closeComment: >
-  This issue is closed. Please re-open if needed.
\ No newline at end of file
+  This issue is closed. Please re-open if needed.
diff --git a/audio/setup.py b/audio/setup.py
index d36b2c440..823e5dfad 100644
--- a/audio/setup.py
+++ b/audio/setup.py
@@ -40,14 +40,9 @@ COMMITID = 'none'
 base = [
     "kaldiio",
     "librosa==0.8.1",
-    "scipy>=1.0.0",
-    "soundfile~=0.10",
-    "colorlog",
-    "pathos==0.2.8",
+    "pathos",
     "pybind11",
     "parameterized",
-    "tqdm",
-    "scikit-learn"
 ]
 
 requirements = {
@@ -273,7 +268,7 @@ def main():
         },
 
         # Package info
-        packages=find_packages(include=('paddleaudio*')),
+        packages=find_packages(include=['paddleaudio*']),
         package_data=lib_package_data,
         ext_modules=setup_helpers.get_ext_modules(),
         zip_safe=True,
diff --git a/demos/speech_web/speech_server/requirements.txt b/demos/speech_web/speech_server/requirements.txt
index cdc654656..8425a1fee 100644
--- a/demos/speech_web/speech_server/requirements.txt
+++ b/demos/speech_web/speech_server/requirements.txt
@@ -1,8 +1,6 @@
 aiofiles
 faiss-cpu
-praatio==5.0.0
+praatio>=5.0.0
 pydantic
 python-multipart
-scikit_learn
 starlette
-uvicorn
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 5422c26f9..609f27925 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,11 +1,9 @@
 braceexpand
 editdistance
-fastapi
 g2p_en
 g2pM
 h5py
 inflect
-jieba
 jsonlines
 kaldiio
 keyboard
@@ -24,30 +22,23 @@ paddlespeech_ctcdecoders
 paddlespeech_feat
 pandas
 pattern_singleton
-Pillow>=9.0.0
 ppdiffusers>=0.9.0
-praatio==5.0.0
+praatio>=5.0.0
 prettytable
 pypinyin-dict
 pypinyin<=0.44.0
 python-dateutil
-pyworld==0.2.12
+pyworld>=0.2.12
 recommonmark>=0.5.0
-resampy==0.2.2
+resampy
 sacrebleu
-scipy
-sentencepiece~=0.1.96
-soundfile~=0.10
 sphinx
 sphinx-autobuild
 sphinx-markdown-tables
 sphinx_rtd_theme
 textgrid
 timer
-tqdm
 typeguard
-uvicorn
-visualdl
 webrtcvad
 websockets
 yacs~=0.1.8
diff --git a/examples/csmsc/vits/README.md b/examples/csmsc/vits/README.md
index 8f223e07b..50d703b2d 100644
--- a/examples/csmsc/vits/README.md
+++ b/examples/csmsc/vits/README.md
@@ -147,14 +147,14 @@ optional arguments:
 
 The pretrained model can be downloaded here:
 
-- [vits_csmsc_ckpt_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/vits/vits_csmsc_ckpt_1.1.0.zip) (add_blank=true)
+- [vits_csmsc_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/vits/vits_csmsc_ckpt_1.4.0.zip) (add_blank=true)
 
 VITS checkpoint contains files listed below.
 ```text
-vits_csmsc_ckpt_1.1.0
-├── default.yaml             # default config used to train vitx
-├── phone_id_map.txt         # phone vocabulary file when training vits
-└── snapshot_iter_333000.pdz # model parameters and optimizer states
+vits_csmsc_ckpt_1.4.0
+├── default.yaml             # default config used to train vits
+├── phone_id_map.txt         # phone vocabulary file when training vits
+└── snapshot_iter_150000.pdz # model parameters and optimizer states
 ```
 
 ps: This ckpt is not good enough, a better result is training
@@ -168,9 +168,9 @@ add_blank=true
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
 python3 ${BIN_DIR}/synthesize_e2e.py \
-    --config=vits_csmsc_ckpt_1.1.0/default.yaml \
-    --ckpt=vits_csmsc_ckpt_1.1.0/snapshot_iter_333000.pdz \
-    --phones_dict=vits_csmsc_ckpt_1.1.0/phone_id_map.txt \
+    --config=vits_csmsc_ckpt_1.4.0/default.yaml \
+    --ckpt=vits_csmsc_ckpt_1.4.0/snapshot_iter_150000.pdz \
+    --phones_dict=vits_csmsc_ckpt_1.4.0/phone_id_map.txt \
     --output_dir=exp/default/test_e2e \
     --text=${BIN_DIR}/../sentences.txt \
     --add-blank=${add_blank}
diff --git a/examples/tess/cls0/local/train.py b/examples/tess/cls0/local/train.py
index 25382d8c3..f023a37b7 100644
--- a/examples/tess/cls0/local/train.py
+++ b/examples/tess/cls0/local/train.py
@@ -121,7 +121,7 @@ if __name__ == "__main__":
             optimizer.clear_grad()
 
             # Calculate loss
-            avg_loss += loss.numpy()[0]
+            avg_loss += float(loss)
 
             # Calculate metrics
             preds = paddle.argmax(logits, axis=1)
diff --git a/paddlespeech/cli/text/infer.py b/paddlespeech/cli/text/infer.py
index ff822f674..bd76a13d0 100644
--- a/paddlespeech/cli/text/infer.py
+++ b/paddlespeech/cli/text/infer.py
@@ -127,7 +127,7 @@ class TextExecutor(BaseExecutor):
         if self.task == 'punc':
             # punc list
             self._punc_list = []
-            with open(self.vocab_file, 'r') as f:
+            with open(self.vocab_file, 'r', encoding='utf-8') as f:
                 for line in f:
                     self._punc_list.append(line.strip())
 
@@ -178,12 +178,12 @@ class TextExecutor(BaseExecutor):
         if self.task == 'punc':
             # punc list
             self._punc_list = []
-            with open(self.vocab_file, 'r') as f:
+            with open(self.vocab_file, 'r', encoding='utf-8') as f:
                 for line in f:
                     self._punc_list.append(line.strip())
 
         # model
-        with open(self.cfg_path) as f:
+        with open(self.cfg_path, 'r', encoding='utf-8') as f:
             config = CfgNode(yaml.safe_load(f))
 
         self.model = ErnieLinear(**config["model"])
diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py
index 707518c05..5515ade26 100644
--- a/paddlespeech/cli/tts/infer.py
+++ b/paddlespeech/cli/tts/infer.py
@@ -292,19 +292,19 @@ class TTSExecutor(BaseExecutor):
         with open(self.voc_config) as f:
             self.voc_config = CfgNode(yaml.safe_load(f))
 
-        with open(self.phones_dict, "r") as f:
+        with open(self.phones_dict, 'rt', encoding='utf-8') as f:
             phn_id = [line.strip().split() for line in f.readlines()]
         vocab_size = len(phn_id)
 
         tone_size = None
         if self.tones_dict:
-            with open(self.tones_dict, "r") as f:
+            with open(self.tones_dict, 'rt', encoding='utf-8') as f:
                 tone_id = [line.strip().split() for line in f.readlines()]
             tone_size = len(tone_id)
 
         spk_num = None
         if self.speaker_dict:
-            with open(self.speaker_dict, 'rt') as f:
+            with open(self.speaker_dict, 'rt', encoding='utf-8') as f:
                 spk_id = [line.strip().split() for line in f.readlines()]
             spk_num = len(spk_id)
 
diff --git a/paddlespeech/cli/whisper/infer.py b/paddlespeech/cli/whisper/infer.py
index c016b453a..ebcca890b 100644
--- a/paddlespeech/cli/whisper/infer.py
+++ b/paddlespeech/cli/whisper/infer.py
@@ -152,8 +152,7 @@ class WhisperExecutor(BaseExecutor):
         Init model and other resources from a specific path.
         """
         logger.debug("start to init the model")
-        # default max_len: unit:second
-        self.max_len = 50
+
         if hasattr(self, 'model'):
             logger.debug('Model had been initialized.')
             return
@@ -339,12 +338,6 @@ class WhisperExecutor(BaseExecutor):
         try:
             audio, audio_sample_rate = soundfile.read(
                 audio_file, dtype="int16", always_2d=True)
-            audio_duration = audio.shape[0] / audio_sample_rate
-            if audio_duration > self.max_len:
-                logger.error(
-                    f"Please input audio file less then {self.max_len} seconds.\n"
-                )
-                return False
         except Exception as e:
             logger.exception(e)
             logger.error(
diff --git a/paddlespeech/s2t/models/whisper/__init__.py b/paddlespeech/s2t/models/whisper/__init__.py
index 98ab23610..b78dece8a 100644
--- a/paddlespeech/s2t/models/whisper/__init__.py
+++ b/paddlespeech/s2t/models/whisper/__init__.py
@@ -1,5 +1,5 @@
 # MIT License, Copyright (c) 2022 OpenAI.
-# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper/__init__.py)
 from paddlespeech.s2t.models.whisper.whipser import decode
diff --git a/paddlespeech/s2t/models/whisper/tokenizer.py b/paddlespeech/s2t/models/whisper/tokenizer.py
index 1e1aea044..e8b201bcc 100644
--- a/paddlespeech/s2t/models/whisper/tokenizer.py
+++ b/paddlespeech/s2t/models/whisper/tokenizer.py
@@ -1,5 +1,5 @@
 # MIT License, Copyright (c) 2022 OpenAI.
-# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper/tokenizer.py)
 import os
diff --git a/paddlespeech/s2t/models/whisper/utils.py b/paddlespeech/s2t/models/whisper/utils.py
index d067af7d2..5528f9604 100644
--- a/paddlespeech/s2t/models/whisper/utils.py
+++ b/paddlespeech/s2t/models/whisper/utils.py
@@ -1,5 +1,5 @@
 # MIT License, Copyright (c) 2022 OpenAI.
-# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper/utils.py)
 import zlib
diff --git a/paddlespeech/s2t/models/whisper/whipser.py b/paddlespeech/s2t/models/whisper/whipser.py
index 9cf9a9eca..a28013e4b 100644
--- a/paddlespeech/s2t/models/whisper/whipser.py
+++ b/paddlespeech/s2t/models/whisper/whipser.py
@@ -1,5 +1,5 @@
 # MIT License, Copyright (c) 2022 OpenAI.
-# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper)
 import os
diff --git a/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py b/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py
index e450aa1a0..c43dafb3c 100644
--- a/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py
@@ -437,7 +437,7 @@ if __name__ == '__main__':
 
     vocab_phones = {}
 
-    with open(args.phones_dict, 'rt') as f:
+    with open(args.phones_dict, 'rt', encoding='utf-8') as f:
         phn_id = [line.strip().split() for line in f.readlines()]
     for phn, id in phn_id:
         vocab_phones[phn] = int(id)
diff --git a/paddlespeech/t2s/exps/ernie_sat/train.py b/paddlespeech/t2s/exps/ernie_sat/train.py
index 75a666bb1..c98d691be 100644
--- a/paddlespeech/t2s/exps/ernie_sat/train.py
+++ b/paddlespeech/t2s/exps/ernie_sat/train.py
@@ -109,7 +109,7 @@ def train_sp(args, config):
         num_workers=config.num_workers)
     print("dataloaders done!")
 
-    with open(args.phones_dict, "r") as f:
+    with open(args.phones_dict, 'rt', encoding='utf-8') as f:
         phn_id = [line.strip().split() for line in f.readlines()]
     vocab_size = len(phn_id)
     print("vocab_size:", vocab_size)
diff --git a/paddlespeech/t2s/exps/fastspeech2/train.py b/paddlespeech/t2s/exps/fastspeech2/train.py
index d31e62a82..97626db0b 100644
--- a/paddlespeech/t2s/exps/fastspeech2/train.py
+++ b/paddlespeech/t2s/exps/fastspeech2/train.py
@@ -67,7 +67,7 @@ def train_sp(args, config):
     if args.speaker_dict is not None:
         print("multiple speaker fastspeech2!")
         collate_fn = fastspeech2_multi_spk_batch_fn
-        with open(args.speaker_dict, 'rt') as f:
+        with open(args.speaker_dict, 'rt', encoding='utf-8') as f:
             spk_id = [line.strip().split() for line in f.readlines()]
         spk_num = len(spk_id)
         fields += ["spk_id"]
@@ -123,7 +123,7 @@ def train_sp(args, config):
         num_workers=config.num_workers)
     print("dataloaders done!")
 
-    with open(args.phones_dict, "r") as f:
+    with open(args.phones_dict, 'rt', encoding='utf-8') as f:
         phn_id = [line.strip().split() for line in f.readlines()]
     vocab_size = len(phn_id)
     print("vocab_size:", vocab_size)
diff --git a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py
index 644ec250d..d05dfafcf 100644
--- a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py
@@ -39,18 +39,18 @@ def evaluate(args, speedyspeech_config, pwg_config):
 
     # construct dataset for evaluation
     sentences = []
-    with open(args.text, 'rt') as f:
+    with open(args.text, 'rt', encoding='utf-8') as f:
        for line in f:
            items = line.strip().split()
            utt_id = items[0]
            sentence = "".join(items[1:])
            sentences.append((utt_id, sentence))
 
-    with open(args.phones_dict, "r") as f:
+    with open(args.phones_dict, 'rt', encoding='utf-8') as f:
         phn_id = [line.strip().split() for line in f.readlines()]
     vocab_size = len(phn_id)
     print("vocab_size:", vocab_size)
-    with open(args.tones_dict, "r") as f:
+    with open(args.tones_dict, 'rt', encoding='utf-8') as f:
         tone_id = [line.strip().split() for line in f.readlines()]
     tone_size = len(tone_id)
     print("tone_size:", tone_size)
diff --git a/paddlespeech/t2s/exps/speedyspeech/train.py b/paddlespeech/t2s/exps/speedyspeech/train.py
index 7b422e64f..c90090daa 100644
--- a/paddlespeech/t2s/exps/speedyspeech/train.py
+++ b/paddlespeech/t2s/exps/speedyspeech/train.py
@@ -70,7 +70,7 @@ def train_sp(args, config):
     if args.speaker_dict is not None:
         print("multiple speaker speedyspeech!")
         collate_fn = speedyspeech_multi_spk_batch_fn
-        with open(args.speaker_dict, 'rt') as f:
+        with open(args.speaker_dict, 'rt', encoding='utf-8') as f:
             spk_id = [line.strip().split() for line in f.readlines()]
         spk_num = len(spk_id)
         fields += ["spk_id"]
@@ -133,11 +133,11 @@ def train_sp(args, config):
         collate_fn=collate_fn,
         num_workers=config.num_workers)
     print("dataloaders done!")
-    with open(args.phones_dict, "r") as f:
+    with open(args.phones_dict, 'rt', encoding='utf-8') as f:
         phn_id = [line.strip().split() for line in f.readlines()]
     vocab_size = len(phn_id)
     print("vocab_size:", vocab_size)
-    with open(args.tones_dict, "r") as f:
+    with open(args.tones_dict, 'rt', encoding='utf-8') as f:
         tone_id = [line.strip().split() for line in f.readlines()]
     tone_size = len(tone_id)
     print("tone_size:", tone_size)
diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py
index 6b693440c..491edda30 100644
--- a/paddlespeech/t2s/exps/syn_utils.py
+++ b/paddlespeech/t2s/exps/syn_utils.py
@@ -106,7 +106,7 @@ def get_chunks(data, block_size: int, pad_size: int):
 def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'):
     # construct dataset for evaluation
     sentences = []
-    with open(text_file, 'rt') as f:
+    with open(text_file, 'rt', encoding='utf-8') as f:
         for line in f:
             if line.strip() != "":
                 items = re.split(r"\s+", line.strip(), 1)
@@ -325,17 +325,17 @@ def get_am_inference(am: str='fastspeech2_csmsc',
                      tones_dict: Optional[os.PathLike]=None,
                      speaker_dict: Optional[os.PathLike]=None,
                      return_am: bool=False):
-    with open(phones_dict, "r") as f:
+    with open(phones_dict, 'rt', encoding='utf-8') as f:
         phn_id = [line.strip().split() for line in f.readlines()]
     vocab_size = len(phn_id)
     tone_size = None
     if tones_dict is not None:
-        with open(tones_dict, "r") as f:
+        with open(tones_dict, 'rt', encoding='utf-8') as f:
             tone_id = [line.strip().split() for line in f.readlines()]
         tone_size = len(tone_id)
     spk_num = None
     if speaker_dict is not None:
-        with open(speaker_dict, 'rt') as f:
+        with open(speaker_dict, 'rt', encoding='utf-8') as f:
             spk_id = [line.strip().split() for line in f.readlines()]
         spk_num = len(spk_id)
     odim = am_config.n_mels
diff --git a/paddlespeech/t2s/exps/tacotron2/train.py b/paddlespeech/t2s/exps/tacotron2/train.py
index 69ff80e46..db88009a8 100644
--- a/paddlespeech/t2s/exps/tacotron2/train.py
+++ b/paddlespeech/t2s/exps/tacotron2/train.py
@@ -119,7 +119,7 @@ def train_sp(args, config):
         num_workers=config.num_workers)
     print("dataloaders done!")
 
-    with open(args.phones_dict, "r") as f:
+    with open(args.phones_dict, 'rt', encoding='utf-8') as f:
         phn_id = [line.strip().split() for line in f.readlines()]
     vocab_size = len(phn_id)
     print("vocab_size:", vocab_size)
diff --git a/paddlespeech/t2s/exps/transformer_tts/train.py b/paddlespeech/t2s/exps/transformer_tts/train.py
index da48b6b99..d49baad99 100644
--- a/paddlespeech/t2s/exps/transformer_tts/train.py
+++ b/paddlespeech/t2s/exps/transformer_tts/train.py
@@ -114,7 +114,7 @@ def train_sp(args, config):
         num_workers=config.num_workers)
     print("dataloaders done!")
 
-    with open(args.phones_dict, "r") as f:
+    with open(args.phones_dict, 'rt', encoding='utf-8') as f:
         phn_id = [line.strip().split() for line in f.readlines()]
     vocab_size = len(phn_id)
     print("vocab_size:", vocab_size)
diff --git a/paddlespeech/t2s/exps/vits/train.py b/paddlespeech/t2s/exps/vits/train.py
index f6a31ced2..0e74bf631 100644
--- a/paddlespeech/t2s/exps/vits/train.py
+++ b/paddlespeech/t2s/exps/vits/train.py
@@ -78,7 +78,7 @@ def train_sp(args, config):
     if args.speaker_dict is not None:
print("multiple speaker vits!") collate_fn = vits_multi_spk_batch_fn - with open(args.speaker_dict, 'rt') as f: + with open(args.speaker_dict, 'rt', encoding='utf-8') as f: spk_id = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id) fields += ["spk_id"] @@ -132,7 +132,7 @@ def train_sp(args, config): num_workers=config.num_workers) print("dataloaders done!") - with open(args.phones_dict, "r") as f: + with open(args.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) print("vocab_size:", vocab_size) diff --git a/paddlespeech/t2s/frontend/phonectic.py b/paddlespeech/t2s/frontend/phonectic.py index 261db80a8..af86d9b80 100644 --- a/paddlespeech/t2s/frontend/phonectic.py +++ b/paddlespeech/t2s/frontend/phonectic.py @@ -58,7 +58,7 @@ class English(Phonetics): self.punc = ":,;。?!“”‘’':,;.?!" self.text_normalizer = TextNormalizer() if phone_vocab_path: - with open(phone_vocab_path, 'rt') as f: + with open(phone_vocab_path, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] for phn, id in phn_id: self.vocab_phones[phn] = int(id) diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index efb673e36..35b97a93a 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -144,12 +144,12 @@ class Frontend(): self.vocab_phones = {} self.vocab_tones = {} if phone_vocab_path: - with open(phone_vocab_path, 'rt') as f: + with open(phone_vocab_path, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] for phn, id in phn_id: self.vocab_phones[phn] = int(id) if tone_vocab_path: - with open(tone_vocab_path, 'rt') as f: + with open(tone_vocab_path, 'rt', encoding='utf-8') as f: tone_id = [line.strip().split() for line in f.readlines()] for tone, id in tone_id: self.vocab_tones[tone] = int(id) diff --git a/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py b/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py index 09e6827d0..1db9248ae 100644 --- a/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py +++ b/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py @@ -113,16 +113,18 @@ class Tacotron2Updater(StandardUpdater): loss.backward() optimizer.step() + if self.use_guided_attn_loss: + report("train/attn_loss", float(attn_loss)) + losses_dict["attn_loss"] = float(attn_loss) + report("train/l1_loss", float(l1_loss)) report("train/mse_loss", float(mse_loss)) report("train/bce_loss", float(bce_loss)) - report("train/attn_loss", float(attn_loss)) report("train/loss", float(loss)) losses_dict["l1_loss"] = float(l1_loss) losses_dict["mse_loss"] = float(mse_loss) losses_dict["bce_loss"] = float(bce_loss) - losses_dict["attn_loss"] = float(attn_loss) losses_dict["loss"] = float(loss) self.msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_dict.items()) @@ -202,17 +204,19 @@ class Tacotron2Evaluator(StandardEvaluator): attn_loss = self.attn_loss( att_ws=att_ws, ilens=batch["text_lengths"] + 1, olens=olens_in) loss = loss + attn_loss + + if self.use_guided_attn_loss: + report("eval/attn_loss", float(attn_loss)) + losses_dict["attn_loss"] = float(attn_loss) report("eval/l1_loss", float(l1_loss)) report("eval/mse_loss", float(mse_loss)) report("eval/bce_loss", float(bce_loss)) - report("eval/attn_loss", float(attn_loss)) report("eval/loss", float(loss)) losses_dict["l1_loss"] = float(l1_loss) losses_dict["mse_loss"] = float(mse_loss) 
losses_dict["bce_loss"] = float(bce_loss) - losses_dict["attn_loss"] = float(attn_loss) losses_dict["loss"] = float(loss) self.msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_dict.items()) diff --git a/setup.py b/setup.py index be6cf63a9..76bc5be8d 100644 --- a/setup.py +++ b/setup.py @@ -37,9 +37,7 @@ base = [ "g2pM", "h5py", "inflect", - "jieba", "jsonlines", - "kaldiio", "librosa==0.8.1", "loguru", "matplotlib", @@ -51,22 +49,16 @@ base = [ "paddlenlp>=2.4.8", "ppdiffusers>=0.9.0", "paddlespeech_feat", - "Pillow>=9.0.0", - "praatio==5.0.0", + "praatio>=5.0.0", "pypinyin<=0.44.0", "pypinyin-dict", "python-dateutil", - "pyworld==0.2.12", - "resampy==0.2.2", + "pyworld>=0.2.12", + "resampy", "sacrebleu", - "scipy", - "sentencepiece~=0.1.96", - "soundfile~=0.10", "textgrid", "timer", - "tqdm", "typeguard", - "visualdl", "webrtcvad", "yacs~=0.1.8", "prettytable", @@ -74,10 +66,10 @@ base = [ "braceexpand", "pyyaml", "paddleslim>=2.3.4", - "paddleaudio>=1.0.2", + "paddleaudio>=1.1.0", ] -server = ["fastapi", "uvicorn", "pattern_singleton", "websockets"] +server = ["pattern_singleton", "websockets"] requirements = { "install": @@ -300,7 +292,7 @@ setup_info = dict( }, # Package info - packages=find_packages(include=('paddlespeech*')), + packages=find_packages(include=['paddlespeech*'], exclude=['utils', 'third_party']), zip_safe=True, classifiers=[ 'Development Status :: 5 - Production/Stable', diff --git a/tests/test_tipc/prepare.sh b/tests/test_tipc/prepare.sh index cb05a1d0f..9ff81bd8b 100755 --- a/tests/test_tipc/prepare.sh +++ b/tests/test_tipc/prepare.sh @@ -73,6 +73,9 @@ if [[ ${MODE} = "benchmark_train" ]];then mkdir -p BZNSYP unrar x BZNSYP.rar BZNSYP wget -nc https://paddlespeech.bj.bcebos.com/Parakeet/benchmark/durations.txt + # 避免网络问题导致的 nltk_data 无法下载使程序 hang 住 + wget -nc https://paddlespeech.bj.bcebos.com/Parakeet/tools/nltk_data.tar.gz + tar -xzf nltk_data.tar.gz -C ${HOME} # 数据预处理 python ../paddlespeech/t2s/exps/gan_vocoder/preprocess.py --rootdir=BZNSYP/ --dumpdir=dump --num-cpu=20 --cut-sil=True --dur-file=durations.txt --config=../examples/csmsc/voc1/conf/default.yaml python ../utils/compute_statistics.py --metadata=dump/train/raw/metadata.jsonl --field-name="feats" diff --git a/third_party/ctc_decoders/setup.py b/third_party/ctc_decoders/setup.py index c13f3df99..5ae5b3bf6 100644 --- a/third_party/ctc_decoders/setup.py +++ b/third_party/ctc_decoders/setup.py @@ -129,7 +129,7 @@ decoders_module = [ setup( name='paddlespeech_ctcdecoders', - version='0.2.0', + version='0.2.2', description="CTC decoders in paddlespeech", author="PaddlePaddle Speech and Language Team", author_email="paddlesl@baidu.com",