diff --git a/.github/stale.yml b/.github/stale.yml
index da19b6606..6b0da9b98 100644
--- a/.github/stale.yml
+++ b/.github/stale.yml
@@ -6,7 +6,8 @@ daysUntilClose: 30
 exemptLabels:
   - Roadmap
   - Bug
-  - New Feature
+  - feature request
+  - Tips
 # Label to use when marking an issue as stale
 staleLabel: Stale
 # Comment to post when marking an issue as stale. Set to `false` to disable
@@ -17,4 +18,4 @@ markComment: >
 unmarkComment: false
 # Comment to post when closing a stale issue. Set to `false` to disable
 closeComment: >
-  This issue is closed. Please re-open if needed.
\ No newline at end of file
+  This issue is closed. Please re-open if needed.
diff --git a/audio/setup.py b/audio/setup.py
index d36b2c440..823e5dfad 100644
--- a/audio/setup.py
+++ b/audio/setup.py
@@ -40,14 +40,9 @@ COMMITID = 'none'
 base = [
     "kaldiio",
     "librosa==0.8.1",
-    "scipy>=1.0.0",
-    "soundfile~=0.10",
-    "colorlog",
-    "pathos==0.2.8",
+    "pathos",
     "pybind11",
     "parameterized",
-    "tqdm",
-    "scikit-learn"
 ]
 
 requirements = {
@@ -273,7 +268,7 @@ def main():
         },
 
         # Package info
-        packages=find_packages(include=('paddleaudio*')),
+        packages=find_packages(include=['paddleaudio*']),
         package_data=lib_package_data,
         ext_modules=setup_helpers.get_ext_modules(),
         zip_safe=True,
diff --git a/demos/speech_web/speech_server/requirements.txt b/demos/speech_web/speech_server/requirements.txt
index cdc654656..8425a1fee 100644
--- a/demos/speech_web/speech_server/requirements.txt
+++ b/demos/speech_web/speech_server/requirements.txt
@@ -1,8 +1,6 @@
 aiofiles
 faiss-cpu
-praatio==5.0.0
+praatio>=5.0.0
 pydantic
 python-multipart
-scikit_learn
 starlette
-uvicorn
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 5422c26f9..609f27925 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,11 +1,9 @@
 braceexpand
 editdistance
-fastapi
 g2p_en
 g2pM
 h5py
 inflect
-jieba
 jsonlines
 kaldiio
 keyboard
@@ -24,30 +22,23 @@ paddlespeech_ctcdecoders
 paddlespeech_feat
 pandas
 pattern_singleton
-Pillow>=9.0.0
 ppdiffusers>=0.9.0
-praatio==5.0.0
+praatio>=5.0.0
 prettytable
 pypinyin-dict
 pypinyin<=0.44.0
 python-dateutil
-pyworld==0.2.12
+pyworld>=0.2.12
 recommonmark>=0.5.0
-resampy==0.2.2
+resampy
 sacrebleu
-scipy
-sentencepiece~=0.1.96
-soundfile~=0.10
 sphinx
 sphinx-autobuild
 sphinx-markdown-tables
 sphinx_rtd_theme
 textgrid
 timer
-tqdm
 typeguard
-uvicorn
-visualdl
 webrtcvad
 websockets
 yacs~=0.1.8
diff --git a/examples/csmsc/vits/README.md b/examples/csmsc/vits/README.md
index 8f223e07b..50d703b2d 100644
--- a/examples/csmsc/vits/README.md
+++ b/examples/csmsc/vits/README.md
@@ -147,14 +147,14 @@ optional arguments:
 
 The pretrained model can be downloaded here:
 
-- [vits_csmsc_ckpt_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/vits/vits_csmsc_ckpt_1.1.0.zip) (add_blank=true)
+- [vits_csmsc_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/vits/vits_csmsc_ckpt_1.4.0.zip) (add_blank=true)
 
 VITS checkpoint contains files listed below.
 ```text
-vits_csmsc_ckpt_1.1.0
-├── default.yaml             # default config used to train vitx
-├── phone_id_map.txt         # phone vocabulary file when training vits
-└── snapshot_iter_333000.pdz # model parameters and optimizer states
+vits_csmsc_ckpt_1.4.0
+├── default.yaml             # default config used to train vits
+├── phone_id_map.txt         # phone vocabulary file when training vits
+└── snapshot_iter_150000.pdz # model parameters and optimizer states
 ```
 
 ps: This ckpt is not good enough, a better result is training
@@ -168,9 +168,9 @@ add_blank=true
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
 python3 ${BIN_DIR}/synthesize_e2e.py \
-    --config=vits_csmsc_ckpt_1.1.0/default.yaml \
-    --ckpt=vits_csmsc_ckpt_1.1.0/snapshot_iter_333000.pdz \
-    --phones_dict=vits_csmsc_ckpt_1.1.0/phone_id_map.txt \
+    --config=vits_csmsc_ckpt_1.4.0/default.yaml \
+    --ckpt=vits_csmsc_ckpt_1.4.0/snapshot_iter_150000.pdz \
+    --phones_dict=vits_csmsc_ckpt_1.4.0/phone_id_map.txt \
     --output_dir=exp/default/test_e2e \
     --text=${BIN_DIR}/../sentences.txt \
     --add-blank=${add_blank}
diff --git a/examples/tess/cls0/local/train.py b/examples/tess/cls0/local/train.py
index 25382d8c3..f023a37b7 100644
--- a/examples/tess/cls0/local/train.py
+++ b/examples/tess/cls0/local/train.py
@@ -121,7 +121,7 @@ if __name__ == "__main__":
             optimizer.clear_grad()
 
             # Calculate loss
-            avg_loss += loss.numpy()[0]
+            avg_loss += float(loss)
 
             # Calculate metrics
             preds = paddle.argmax(logits, axis=1)
diff --git a/paddlespeech/cli/text/infer.py b/paddlespeech/cli/text/infer.py
index ff822f674..bd76a13d0 100644
--- a/paddlespeech/cli/text/infer.py
+++ b/paddlespeech/cli/text/infer.py
@@ -127,7 +127,7 @@ class TextExecutor(BaseExecutor):
         if self.task == 'punc':
             # punc list
             self._punc_list = []
-            with open(self.vocab_file, 'r') as f:
+            with open(self.vocab_file, 'r', encoding='utf-8') as f:
                 for line in f:
                     self._punc_list.append(line.strip())
 
@@ -178,12 +178,12 @@ class TextExecutor(BaseExecutor):
         if self.task == 'punc':
             # punc list
             self._punc_list = []
-            with open(self.vocab_file, 'r') as f:
+            with open(self.vocab_file, 'r', encoding='utf-8') as f:
                 for line in f:
                     self._punc_list.append(line.strip())
 
         # model
-        with open(self.cfg_path) as f:
+        with open(self.cfg_path, 'r', encoding='utf-8') as f:
             config = CfgNode(yaml.safe_load(f))
 
         self.model = ErnieLinear(**config["model"])
diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py
index 707518c05..5515ade26 100644
--- a/paddlespeech/cli/tts/infer.py
+++ b/paddlespeech/cli/tts/infer.py
@@ -292,19 +292,19 @@ class TTSExecutor(BaseExecutor):
         with open(self.voc_config) as f:
             self.voc_config = CfgNode(yaml.safe_load(f))
 
-        with open(self.phones_dict, "r") as f:
+        with open(self.phones_dict, 'rt', encoding='utf-8') as f:
             phn_id = [line.strip().split() for line in f.readlines()]
         vocab_size = len(phn_id)
 
         tone_size = None
         if self.tones_dict:
-            with open(self.tones_dict, "r") as f:
+            with open(self.tones_dict, 'rt', encoding='utf-8') as f:
                 tone_id = [line.strip().split() for line in f.readlines()]
             tone_size = len(tone_id)
 
         spk_num = None
         if self.speaker_dict:
-            with open(self.speaker_dict, 'rt') as f:
+            with open(self.speaker_dict, 'rt', encoding='utf-8') as f:
                 spk_id = [line.strip().split() for line in f.readlines()]
             spk_num = len(spk_id)
 
diff --git a/paddlespeech/cli/whisper/infer.py b/paddlespeech/cli/whisper/infer.py
index c016b453a..ebcca890b 100644
--- a/paddlespeech/cli/whisper/infer.py
+++ b/paddlespeech/cli/whisper/infer.py
@@ -152,8 +152,7 @@ class WhisperExecutor(BaseExecutor):
         Init model and other resources from a specific path.
         """
         logger.debug("start to init the model")
-        # default max_len: unit:second
-        self.max_len = 50
+
         if hasattr(self, 'model'):
             logger.debug('Model had been initialized.')
             return
@@ -339,12 +338,6 @@ class WhisperExecutor(BaseExecutor):
         try:
             audio, audio_sample_rate = soundfile.read(
                 audio_file, dtype="int16", always_2d=True)
-            audio_duration = audio.shape[0] / audio_sample_rate
-            if audio_duration > self.max_len:
-                logger.error(
-                    f"Please input audio file less then {self.max_len} seconds.\n"
-                )
-                return False
         except Exception as e:
             logger.exception(e)
             logger.error(
diff --git a/paddlespeech/s2t/models/whisper/__init__.py b/paddlespeech/s2t/models/whisper/__init__.py
index 98ab23610..b78dece8a 100644
--- a/paddlespeech/s2t/models/whisper/__init__.py
+++ b/paddlespeech/s2t/models/whisper/__init__.py
@@ -1,5 +1,5 @@
 # MIT License, Copyright (c) 2022 OpenAI.
-# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper/__init__.py)
 from paddlespeech.s2t.models.whisper.whipser import decode
diff --git a/paddlespeech/s2t/models/whisper/tokenizer.py b/paddlespeech/s2t/models/whisper/tokenizer.py
index 1e1aea044..e8b201bcc 100644
--- a/paddlespeech/s2t/models/whisper/tokenizer.py
+++ b/paddlespeech/s2t/models/whisper/tokenizer.py
@@ -1,5 +1,5 @@
 # MIT License, Copyright (c) 2022 OpenAI.
-# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper/tokenizer.py)
 import os
diff --git a/paddlespeech/s2t/models/whisper/utils.py b/paddlespeech/s2t/models/whisper/utils.py
index d067af7d2..5528f9604 100644
--- a/paddlespeech/s2t/models/whisper/utils.py
+++ b/paddlespeech/s2t/models/whisper/utils.py
@@ -1,5 +1,5 @@
 # MIT License, Copyright (c) 2022 OpenAI.
-# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper/utils.py)
 import zlib
diff --git a/paddlespeech/s2t/models/whisper/whipser.py b/paddlespeech/s2t/models/whisper/whipser.py
index 9cf9a9eca..a28013e4b 100644
--- a/paddlespeech/s2t/models/whisper/whipser.py
+++ b/paddlespeech/s2t/models/whisper/whipser.py
@@ -1,5 +1,5 @@
 # MIT License, Copyright (c) 2022 OpenAI.
-# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper)
 import os
diff --git a/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py b/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py
index e450aa1a0..c43dafb3c 100644
--- a/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py
@@ -437,7 +437,7 @@ if __name__ == '__main__':
 
     vocab_phones = {}
 
-    with open(args.phones_dict, 'rt') as f:
+    with open(args.phones_dict, 'rt', encoding='utf-8') as f:
         phn_id = [line.strip().split() for line in f.readlines()]
     for phn, id in phn_id:
         vocab_phones[phn] = int(id)
diff --git a/paddlespeech/t2s/exps/ernie_sat/train.py b/paddlespeech/t2s/exps/ernie_sat/train.py
index 75a666bb1..c98d691be 100644
--- a/paddlespeech/t2s/exps/ernie_sat/train.py
+++ b/paddlespeech/t2s/exps/ernie_sat/train.py
@@ -109,7 +109,7 @@ def train_sp(args, config):
         num_workers=config.num_workers)
     print("dataloaders done!")
 
-    with open(args.phones_dict, "r") as f:
+    with open(args.phones_dict, 'rt', encoding='utf-8') as f:
         phn_id = [line.strip().split() for line in f.readlines()]
     vocab_size = len(phn_id)
     print("vocab_size:", vocab_size)
diff --git a/paddlespeech/t2s/exps/fastspeech2/train.py b/paddlespeech/t2s/exps/fastspeech2/train.py
index d31e62a82..97626db0b 100644
--- a/paddlespeech/t2s/exps/fastspeech2/train.py
+++ b/paddlespeech/t2s/exps/fastspeech2/train.py
@@ -67,7 +67,7 @@ def train_sp(args, config):
     if args.speaker_dict is not None:
         print("multiple speaker fastspeech2!")
         collate_fn = fastspeech2_multi_spk_batch_fn
-        with open(args.speaker_dict, 'rt') as f:
+        with open(args.speaker_dict, 'rt', encoding='utf-8') as f:
             spk_id = [line.strip().split() for line in f.readlines()]
         spk_num = len(spk_id)
         fields += ["spk_id"]
@@ -123,7 +123,7 @@ def train_sp(args, config):
         num_workers=config.num_workers)
     print("dataloaders done!")
 
-    with open(args.phones_dict, "r") as f:
+    with open(args.phones_dict, 'rt', encoding='utf-8') as f:
         phn_id = [line.strip().split() for line in f.readlines()]
     vocab_size = len(phn_id)
     print("vocab_size:", vocab_size)
diff --git a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py
index 644ec250d..d05dfafcf 100644
--- a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py
@@ -39,18 +39,18 @@ def evaluate(args, speedyspeech_config, pwg_config):
 
     # construct dataset for evaluation
     sentences = []
-    with open(args.text, 'rt') as f:
+    with open(args.text, 'rt', encoding='utf-8') as f:
        for line in f:
            items = line.strip().split()
            utt_id = items[0]
            sentence = "".join(items[1:])
            sentences.append((utt_id, sentence))
 
-    with open(args.phones_dict, "r") as f:
+    with open(args.phones_dict, 'rt', encoding='utf-8') as f:
         phn_id = [line.strip().split() for line in f.readlines()]
     vocab_size = len(phn_id)
     print("vocab_size:", vocab_size)
-    with open(args.tones_dict, "r") as f:
+    with open(args.tones_dict, 'rt', encoding='utf-8') as f:
         tone_id = [line.strip().split() for line in f.readlines()]
     tone_size = len(tone_id)
     print("tone_size:", tone_size)
diff --git a/paddlespeech/t2s/exps/speedyspeech/train.py b/paddlespeech/t2s/exps/speedyspeech/train.py
index 7b422e64f..c90090daa 100644
--- a/paddlespeech/t2s/exps/speedyspeech/train.py
+++ b/paddlespeech/t2s/exps/speedyspeech/train.py
@@ -70,7 +70,7 @@ def train_sp(args, config):
     if args.speaker_dict is not None:
         print("multiple speaker speedyspeech!")
         collate_fn = speedyspeech_multi_spk_batch_fn
-        with open(args.speaker_dict, 'rt') as f:
+        with open(args.speaker_dict, 'rt', encoding='utf-8') as f:
             spk_id = [line.strip().split() for line in f.readlines()]
         spk_num = len(spk_id)
         fields += ["spk_id"]
@@ -133,11 +133,11 @@ def train_sp(args, config):
         collate_fn=collate_fn,
         num_workers=config.num_workers)
     print("dataloaders done!")
-    with open(args.phones_dict, "r") as f:
+    with open(args.phones_dict, 'rt', encoding='utf-8') as f:
         phn_id = [line.strip().split() for line in f.readlines()]
     vocab_size = len(phn_id)
     print("vocab_size:", vocab_size)
-    with open(args.tones_dict, "r") as f:
+    with open(args.tones_dict, 'rt', encoding='utf-8') as f:
         tone_id = [line.strip().split() for line in f.readlines()]
     tone_size = len(tone_id)
     print("tone_size:", tone_size)
diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py
index 6b693440c..491edda30 100644
--- a/paddlespeech/t2s/exps/syn_utils.py
+++ b/paddlespeech/t2s/exps/syn_utils.py
@@ -106,7 +106,7 @@ def get_chunks(data, block_size: int, pad_size: int):
 def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'):
     # construct dataset for evaluation
     sentences = []
-    with open(text_file, 'rt') as f:
+    with open(text_file, 'rt', encoding='utf-8') as f:
         for line in f:
             if line.strip() != "":
                 items = re.split(r"\s+", line.strip(), 1)
@@ -325,17 +325,17 @@ def get_am_inference(am: str='fastspeech2_csmsc',
                      tones_dict: Optional[os.PathLike]=None,
                      speaker_dict: Optional[os.PathLike]=None,
                      return_am: bool=False):
-    with open(phones_dict, "r") as f:
+    with open(phones_dict, 'rt', encoding='utf-8') as f:
         phn_id = [line.strip().split() for line in f.readlines()]
     vocab_size = len(phn_id)
     tone_size = None
     if tones_dict is not None:
-        with open(tones_dict, "r") as f:
+        with open(tones_dict, 'rt', encoding='utf-8') as f:
             tone_id = [line.strip().split() for line in f.readlines()]
         tone_size = len(tone_id)
     spk_num = None
     if speaker_dict is not None:
-        with open(speaker_dict, 'rt') as f:
+        with open(speaker_dict, 'rt', encoding='utf-8') as f:
             spk_id = [line.strip().split() for line in f.readlines()]
         spk_num = len(spk_id)
     odim = am_config.n_mels
diff --git a/paddlespeech/t2s/exps/tacotron2/train.py b/paddlespeech/t2s/exps/tacotron2/train.py
index 69ff80e46..db88009a8 100644
--- a/paddlespeech/t2s/exps/tacotron2/train.py
+++ b/paddlespeech/t2s/exps/tacotron2/train.py
@@ -119,7 +119,7 @@ def train_sp(args, config):
         num_workers=config.num_workers)
     print("dataloaders done!")
 
-    with open(args.phones_dict, "r") as f:
+    with open(args.phones_dict, 'rt', encoding='utf-8') as f:
         phn_id = [line.strip().split() for line in f.readlines()]
     vocab_size = len(phn_id)
     print("vocab_size:", vocab_size)
diff --git a/paddlespeech/t2s/exps/transformer_tts/train.py b/paddlespeech/t2s/exps/transformer_tts/train.py
index da48b6b99..d49baad99 100644
--- a/paddlespeech/t2s/exps/transformer_tts/train.py
+++ b/paddlespeech/t2s/exps/transformer_tts/train.py
@@ -114,7 +114,7 @@ def train_sp(args, config):
         num_workers=config.num_workers)
     print("dataloaders done!")
 
-    with open(args.phones_dict, "r") as f:
+    with open(args.phones_dict, 'rt', encoding='utf-8') as f:
         phn_id = [line.strip().split() for line in f.readlines()]
     vocab_size = len(phn_id)
     print("vocab_size:", vocab_size)
diff --git a/paddlespeech/t2s/exps/vits/train.py b/paddlespeech/t2s/exps/vits/train.py
index f6a31ced2..0e74bf631 100644
--- a/paddlespeech/t2s/exps/vits/train.py
+++ b/paddlespeech/t2s/exps/vits/train.py
@@ -78,7 +78,7 @@ def train_sp(args, config):
     if args.speaker_dict is not None:
print("multiple speaker vits!") collate_fn = vits_multi_spk_batch_fn - with open(args.speaker_dict, 'rt') as f: + with open(args.speaker_dict, 'rt', encoding='utf-8') as f: spk_id = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id) fields += ["spk_id"] @@ -132,7 +132,7 @@ def train_sp(args, config): num_workers=config.num_workers) print("dataloaders done!") - with open(args.phones_dict, "r") as f: + with open(args.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) print("vocab_size:", vocab_size) diff --git a/paddlespeech/t2s/frontend/phonectic.py b/paddlespeech/t2s/frontend/phonectic.py index 261db80a8..af86d9b80 100644 --- a/paddlespeech/t2s/frontend/phonectic.py +++ b/paddlespeech/t2s/frontend/phonectic.py @@ -58,7 +58,7 @@ class English(Phonetics): self.punc = ":,;。?!“”‘’':,;.?!" self.text_normalizer = TextNormalizer() if phone_vocab_path: - with open(phone_vocab_path, 'rt') as f: + with open(phone_vocab_path, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] for phn, id in phn_id: self.vocab_phones[phn] = int(id) diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index efb673e36..35b97a93a 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -144,12 +144,12 @@ class Frontend(): self.vocab_phones = {} self.vocab_tones = {} if phone_vocab_path: - with open(phone_vocab_path, 'rt') as f: + with open(phone_vocab_path, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] for phn, id in phn_id: self.vocab_phones[phn] = int(id) if tone_vocab_path: - with open(tone_vocab_path, 'rt') as f: + with open(tone_vocab_path, 'rt', encoding='utf-8') as f: tone_id = [line.strip().split() for line in f.readlines()] for tone, id in tone_id: self.vocab_tones[tone] = int(id) diff --git a/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py b/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py index 09e6827d0..1db9248ae 100644 --- a/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py +++ b/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py @@ -113,16 +113,18 @@ class Tacotron2Updater(StandardUpdater): loss.backward() optimizer.step() + if self.use_guided_attn_loss: + report("train/attn_loss", float(attn_loss)) + losses_dict["attn_loss"] = float(attn_loss) + report("train/l1_loss", float(l1_loss)) report("train/mse_loss", float(mse_loss)) report("train/bce_loss", float(bce_loss)) - report("train/attn_loss", float(attn_loss)) report("train/loss", float(loss)) losses_dict["l1_loss"] = float(l1_loss) losses_dict["mse_loss"] = float(mse_loss) losses_dict["bce_loss"] = float(bce_loss) - losses_dict["attn_loss"] = float(attn_loss) losses_dict["loss"] = float(loss) self.msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_dict.items()) @@ -202,17 +204,19 @@ class Tacotron2Evaluator(StandardEvaluator): attn_loss = self.attn_loss( att_ws=att_ws, ilens=batch["text_lengths"] + 1, olens=olens_in) loss = loss + attn_loss + + if self.use_guided_attn_loss: + report("eval/attn_loss", float(attn_loss)) + losses_dict["attn_loss"] = float(attn_loss) report("eval/l1_loss", float(l1_loss)) report("eval/mse_loss", float(mse_loss)) report("eval/bce_loss", float(bce_loss)) - report("eval/attn_loss", float(attn_loss)) report("eval/loss", float(loss)) losses_dict["l1_loss"] = float(l1_loss) losses_dict["mse_loss"] = float(mse_loss) 
losses_dict["bce_loss"] = float(bce_loss) - losses_dict["attn_loss"] = float(attn_loss) losses_dict["loss"] = float(loss) self.msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_dict.items()) diff --git a/setup.py b/setup.py index be6cf63a9..76bc5be8d 100644 --- a/setup.py +++ b/setup.py @@ -37,9 +37,7 @@ base = [ "g2pM", "h5py", "inflect", - "jieba", "jsonlines", - "kaldiio", "librosa==0.8.1", "loguru", "matplotlib", @@ -51,22 +49,16 @@ base = [ "paddlenlp>=2.4.8", "ppdiffusers>=0.9.0", "paddlespeech_feat", - "Pillow>=9.0.0", - "praatio==5.0.0", + "praatio>=5.0.0", "pypinyin<=0.44.0", "pypinyin-dict", "python-dateutil", - "pyworld==0.2.12", - "resampy==0.2.2", + "pyworld>=0.2.12", + "resampy", "sacrebleu", - "scipy", - "sentencepiece~=0.1.96", - "soundfile~=0.10", "textgrid", "timer", - "tqdm", "typeguard", - "visualdl", "webrtcvad", "yacs~=0.1.8", "prettytable", @@ -74,10 +66,10 @@ base = [ "braceexpand", "pyyaml", "paddleslim>=2.3.4", - "paddleaudio>=1.0.2", + "paddleaudio>=1.1.0", ] -server = ["fastapi", "uvicorn", "pattern_singleton", "websockets"] +server = ["pattern_singleton", "websockets"] requirements = { "install": @@ -300,7 +292,7 @@ setup_info = dict( }, # Package info - packages=find_packages(include=('paddlespeech*')), + packages=find_packages(include=['paddlespeech*'], exclude=['utils', 'third_party']), zip_safe=True, classifiers=[ 'Development Status :: 5 - Production/Stable', diff --git a/tests/test_tipc/prepare.sh b/tests/test_tipc/prepare.sh index cb05a1d0f..9ff81bd8b 100755 --- a/tests/test_tipc/prepare.sh +++ b/tests/test_tipc/prepare.sh @@ -73,6 +73,9 @@ if [[ ${MODE} = "benchmark_train" ]];then mkdir -p BZNSYP unrar x BZNSYP.rar BZNSYP wget -nc https://paddlespeech.bj.bcebos.com/Parakeet/benchmark/durations.txt + # 避免网络问题导致的 nltk_data 无法下载使程序 hang 住 + wget -nc https://paddlespeech.bj.bcebos.com/Parakeet/tools/nltk_data.tar.gz + tar -xzf nltk_data.tar.gz -C ${HOME} # 数据预处理 python ../paddlespeech/t2s/exps/gan_vocoder/preprocess.py --rootdir=BZNSYP/ --dumpdir=dump --num-cpu=20 --cut-sil=True --dur-file=durations.txt --config=../examples/csmsc/voc1/conf/default.yaml python ../utils/compute_statistics.py --metadata=dump/train/raw/metadata.jsonl --field-name="feats" diff --git a/third_party/ctc_decoders/setup.py b/third_party/ctc_decoders/setup.py index c13f3df99..5ae5b3bf6 100644 --- a/third_party/ctc_decoders/setup.py +++ b/third_party/ctc_decoders/setup.py @@ -129,7 +129,7 @@ decoders_module = [ setup( name='paddlespeech_ctcdecoders', - version='0.2.0', + version='0.2.2', description="CTC decoders in paddlespeech", author="PaddlePaddle Speech and Language Team", author_email="paddlesl@baidu.com",