Merge branch 'PaddlePaddle:develop' into diffusion

pull/2902/head^2
艾梦 3 years ago committed by GitHub
commit 50837f5a5b

.github/stale.yml

@@ -6,7 +6,8 @@ daysUntilClose: 30
exemptLabels:
- Roadmap
- Bug
-- New Feature
+- feature request
+- Tips
# Label to use when marking an issue as stale
staleLabel: Stale
# Comment to post when marking an issue as stale. Set to `false` to disable
@@ -17,4 +18,4 @@ markComment: >
unmarkComment: false
# Comment to post when closing a stale issue. Set to `false` to disable
closeComment: >
-This issue is closed. Please re-open if needed.
+This issue is closed. Please re-open if needed.

@@ -40,14 +40,9 @@ COMMITID = 'none'
base = [
"kaldiio",
"librosa==0.8.1",
"scipy>=1.0.0",
"soundfile~=0.10",
"colorlog",
"pathos==0.2.8",
"pathos",
"pybind11",
"parameterized",
"tqdm",
"scikit-learn"
]
requirements = {
@@ -273,7 +268,7 @@ def main():
},
# Package info
-packages=find_packages(include=('paddleaudio*')),
+packages=find_packages(include=['paddleaudio*']),
package_data=lib_package_data,
ext_modules=setup_helpers.get_ext_modules(),
zip_safe=True,
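This `include=` fix is more than cosmetic: `('paddleaudio*')` is a parenthesized string, not a one-element tuple, and setuptools treats `include` as an iterable of glob patterns, so a bare string would be consumed character by character. A minimal sketch of the Python semantics behind the change:

```python
# A parenthesized string is still a string; only a comma makes a tuple.
patterns_str = ('paddleaudio*')     # type: str -> iterating yields characters
patterns_tuple = ('paddleaudio*',)  # one-element tuple: a single glob pattern
patterns_list = ['paddleaudio*']    # list form, as used in the fix above

print(list(patterns_str)[:3])   # ['p', 'a', 'd'] -- not what was intended
print(list(patterns_tuple))     # ['paddleaudio*']
```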

@@ -1,8 +1,6 @@
aiofiles
faiss-cpu
-praatio==5.0.0
+praatio>=5.0.0
pydantic
python-multipart
scikit_learn
starlette
uvicorn

@@ -1,11 +1,9 @@
braceexpand
editdistance
fastapi
g2p_en
g2pM
h5py
inflect
jieba
jsonlines
kaldiio
keyboard
@@ -24,30 +22,23 @@ paddlespeech_ctcdecoders
paddlespeech_feat
pandas
pattern_singleton
Pillow>=9.0.0
ppdiffusers>=0.9.0
-praatio==5.0.0
+praatio>=5.0.0
prettytable
pypinyin-dict
pypinyin<=0.44.0
python-dateutil
-pyworld==0.2.12
+pyworld>=0.2.12
recommonmark>=0.5.0
-resampy==0.2.2
+resampy
sacrebleu
scipy
sentencepiece~=0.1.96
soundfile~=0.10
sphinx
sphinx-autobuild
sphinx-markdown-tables
sphinx_rtd_theme
textgrid
timer
tqdm
typeguard
uvicorn
visualdl
webrtcvad
websockets
yacs~=0.1.8
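The pin relaxations in this file (praatio, pyworld, resampy) trade reproducibility for installability: `>=` admits future major versions, which can break APIs (praatio 6.x, for example, changed its TextGrid interface). A quick sketch of the specifier semantics using the `packaging` library:

```python
from packaging.specifiers import SpecifierSet

print('5.0.0' in SpecifierSet('==5.0.0'))  # True: exact pin
print('6.1.0' in SpecifierSet('==5.0.0'))  # False
print('6.1.0' in SpecifierSet('>=5.0.0'))  # True: future majors now satisfy the pin
```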

@@ -147,14 +147,14 @@ optional arguments:
The pretrained model can be downloaded here:
-- [vits_csmsc_ckpt_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/vits/vits_csmsc_ckpt_1.1.0.zip) (add_blank=true)
+- [vits_csmsc_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/vits/vits_csmsc_ckpt_1.4.0.zip) (add_blank=true)
The VITS checkpoint contains the files listed below.
```text
-vits_csmsc_ckpt_1.1.0
-├── default.yaml # default config used to train vits
-├── phone_id_map.txt # phone vocabulary file when training vits
-└── snapshot_iter_333000.pdz # model parameters and optimizer states
+vits_csmsc_ckpt_1.4.0
+├── default.yaml # default config used to train vits
+├── phone_id_map.txt # phone vocabulary file when training vits
+└── snapshot_iter_150000.pdz # model parameters and optimizer states
```
ps: This ckpt is not good enough; a better one is still in training.
@@ -168,9 +168,9 @@ add_blank=true
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/synthesize_e2e.py \
---config=vits_csmsc_ckpt_1.1.0/default.yaml \
---ckpt=vits_csmsc_ckpt_1.1.0/snapshot_iter_333000.pdz \
---phones_dict=vits_csmsc_ckpt_1.1.0/phone_id_map.txt \
+--config=vits_csmsc_ckpt_1.4.0/default.yaml \
+--ckpt=vits_csmsc_ckpt_1.4.0/snapshot_iter_150000.pdz \
+--phones_dict=vits_csmsc_ckpt_1.4.0/phone_id_map.txt \
--output_dir=exp/default/test_e2e \
--text=${BIN_DIR}/../sentences.txt \
--add-blank=${add_blank}
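For readers wondering about `--add-blank`: VITS-style models commonly interleave a blank token between input phone IDs, which tends to improve alignment and audio quality. A conceptual sketch of that operation (PaddleSpeech's actual helper and blank-token value may differ):

```python
def intersperse(seq, item=0):
    """Insert `item` between elements and at both ends: [3, 7] -> [0, 3, 0, 7, 0]."""
    out = [item] * (2 * len(seq) + 1)
    out[1::2] = seq
    return out

print(intersperse([3, 7, 9]))  # [0, 3, 0, 7, 0, 9, 0]
```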

@@ -121,7 +121,7 @@ if __name__ == "__main__":
optimizer.clear_grad()
# Calculate loss
-avg_loss += loss.numpy()[0]
+avg_loss += float(loss)
# Calculate metrics
preds = paddle.argmax(logits, axis=1)
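This pattern shows up because newer Paddle releases return 0-d (shape `()`) values for scalar tensors, so indexing the result of `.numpy()` with `[0]` raises `IndexError`; `float(tensor)` handles both the old 1-element and the new 0-d behavior. The failure mode, sketched with plain NumPy:

```python
import numpy as np

arr0d = np.asarray(np.float32(1.5))  # 0-d array, shape ()
try:
    arr0d[0]
except IndexError as e:
    print(e)          # too many indices for a 0-d array
print(float(arr0d))   # 1.5 -- float() works for 0-d and 1-element alike
```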

@@ -127,7 +127,7 @@ class TextExecutor(BaseExecutor):
if self.task == 'punc':
# punc list
self._punc_list = []
-with open(self.vocab_file, 'r') as f:
+with open(self.vocab_file, 'r', encoding='utf-8') as f:
for line in f:
self._punc_list.append(line.strip())
@@ -178,12 +178,12 @@ class TextExecutor(BaseExecutor):
if self.task == 'punc':
# punc list
self._punc_list = []
-with open(self.vocab_file, 'r') as f:
+with open(self.vocab_file, 'r', encoding='utf-8') as f:
for line in f:
self._punc_list.append(line.strip())
# model
-with open(self.cfg_path) as f:
+with open(self.cfg_path, 'r', encoding='utf-8') as f:
config = CfgNode(yaml.safe_load(f))
self.model = ErnieLinear(**config["model"])

@@ -292,19 +292,19 @@ class TTSExecutor(BaseExecutor):
with open(self.voc_config) as f:
self.voc_config = CfgNode(yaml.safe_load(f))
with open(self.phones_dict, "r") as f:
with open(self.phones_dict, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
tone_size = None
if self.tones_dict:
-with open(self.tones_dict, "r") as f:
+with open(self.tones_dict, 'rt', encoding='utf-8') as f:
tone_id = [line.strip().split() for line in f.readlines()]
tone_size = len(tone_id)
spk_num = None
if self.speaker_dict:
-with open(self.speaker_dict, 'rt') as f:
+with open(self.speaker_dict, 'rt', encoding='utf-8') as f:
spk_id = [line.strip().split() for line in f.readlines()]
spk_num = len(spk_id)

@@ -152,8 +152,7 @@ class WhisperExecutor(BaseExecutor):
Init model and other resources from a specific path.
"""
logger.debug("start to init the model")
-# default max_len: unit:second
-self.max_len = 50
if hasattr(self, 'model'):
logger.debug('Model had been initialized.')
return
@@ -339,12 +338,6 @@
try:
audio, audio_sample_rate = soundfile.read(
audio_file, dtype="int16", always_2d=True)
-audio_duration = audio.shape[0] / audio_sample_rate
-if audio_duration > self.max_len:
-    logger.error(
-        f"Please input audio file less then {self.max_len} seconds.\n"
-    )
-    return False
except Exception as e:
logger.exception(e)
logger.error(
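With this removal, develop no longer rejects inputs longer than 50 seconds; the executor reads whatever soundfile returns. A caller who still wants the old guard can keep the duration computation from the deleted lines locally (sketch with a placeholder path, not repo code):

```python
import soundfile

audio, sr = soundfile.read('input.wav', dtype='int16', always_2d=True)
duration_s = audio.shape[0] / sr  # frames divided by sample rate
if duration_s > 50:
    print(f'audio is {duration_s:.1f}s; long inputs are allowed now, just slower')
```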

@@ -1,5 +1,5 @@
# MIT License, Copyright (c) 2022 OpenAI.
-# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper/__init__.py)
from paddlespeech.s2t.models.whisper.whipser import decode

@@ -1,5 +1,5 @@
# MIT License, Copyright (c) 2022 OpenAI.
-# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper/tokenizer.py)
import os

@@ -1,5 +1,5 @@
# MIT License, Copyright (c) 2022 OpenAI.
-# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper/utils.py)
import zlib

@@ -1,5 +1,5 @@
# MIT License, Copyright (c) 2022 OpenAI.
-# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper)
import os

@@ -437,7 +437,7 @@ if __name__ == '__main__':
vocab_phones = {}
-with open(args.phones_dict, 'rt') as f:
+with open(args.phones_dict, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
for phn, id in phn_id:
vocab_phones[phn] = int(id)

@@ -109,7 +109,7 @@ def train_sp(args, config):
num_workers=config.num_workers)
print("dataloaders done!")
with open(args.phones_dict, "r") as f:
with open(args.phones_dict, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
print("vocab_size:", vocab_size)

@@ -67,7 +67,7 @@ def train_sp(args, config):
if args.speaker_dict is not None:
print("multiple speaker fastspeech2!")
collate_fn = fastspeech2_multi_spk_batch_fn
-with open(args.speaker_dict, 'rt') as f:
+with open(args.speaker_dict, 'rt', encoding='utf-8') as f:
spk_id = [line.strip().split() for line in f.readlines()]
spk_num = len(spk_id)
fields += ["spk_id"]
@@ -123,7 +123,7 @@ def train_sp(args, config):
num_workers=config.num_workers)
print("dataloaders done!")
with open(args.phones_dict, "r") as f:
with open(args.phones_dict, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
print("vocab_size:", vocab_size)

@@ -39,18 +39,18 @@ def evaluate(args, speedyspeech_config, pwg_config):
# construct dataset for evaluation
sentences = []
-with open(args.text, 'rt') as f:
+with open(args.text, 'rt', encoding='utf-8') as f:
for line in f:
items = line.strip().split()
utt_id = items[0]
sentence = "".join(items[1:])
sentences.append((utt_id, sentence))
with open(args.phones_dict, "r") as f:
with open(args.phones_dict, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
print("vocab_size:", vocab_size)
with open(args.tones_dict, "r") as f:
with open(args.tones_dict, 'rt', encoding='utf-8') as f:
tone_id = [line.strip().split() for line in f.readlines()]
tone_size = len(tone_id)
print("tone_size:", tone_size)

@@ -70,7 +70,7 @@ def train_sp(args, config):
if args.speaker_dict is not None:
print("multiple speaker speedyspeech!")
collate_fn = speedyspeech_multi_spk_batch_fn
-with open(args.speaker_dict, 'rt') as f:
+with open(args.speaker_dict, 'rt', encoding='utf-8') as f:
spk_id = [line.strip().split() for line in f.readlines()]
spk_num = len(spk_id)
fields += ["spk_id"]
@@ -133,11 +133,11 @@ def train_sp(args, config):
collate_fn=collate_fn,
num_workers=config.num_workers)
print("dataloaders done!")
with open(args.phones_dict, "r") as f:
with open(args.phones_dict, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
print("vocab_size:", vocab_size)
with open(args.tones_dict, "r") as f:
with open(args.tones_dict, 'rt', encoding='utf-8') as f:
tone_id = [line.strip().split() for line in f.readlines()]
tone_size = len(tone_id)
print("tone_size:", tone_size)

@@ -106,7 +106,7 @@ def get_chunks(data, block_size: int, pad_size: int):
def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'):
# construct dataset for evaluation
sentences = []
-with open(text_file, 'rt') as f:
+with open(text_file, 'rt', encoding='utf-8') as f:
for line in f:
if line.strip() != "":
items = re.split(r"\s+", line.strip(), 1)
@@ -325,17 +325,17 @@ def get_am_inference(am: str='fastspeech2_csmsc',
tones_dict: Optional[os.PathLike]=None,
speaker_dict: Optional[os.PathLike]=None,
return_am: bool=False):
with open(phones_dict, "r") as f:
with open(phones_dict, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
tone_size = None
if tones_dict is not None:
with open(tones_dict, "r") as f:
with open(tones_dict, 'rt', encoding='utf-8') as f:
tone_id = [line.strip().split() for line in f.readlines()]
tone_size = len(tone_id)
spk_num = None
if speaker_dict is not None:
-with open(speaker_dict, 'rt') as f:
+with open(speaker_dict, 'rt', encoding='utf-8') as f:
spk_id = [line.strip().split() for line in f.readlines()]
spk_num = len(spk_id)
odim = am_config.n_mels

@@ -119,7 +119,7 @@ def train_sp(args, config):
num_workers=config.num_workers)
print("dataloaders done!")
with open(args.phones_dict, "r") as f:
with open(args.phones_dict, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
print("vocab_size:", vocab_size)

@@ -114,7 +114,7 @@ def train_sp(args, config):
num_workers=config.num_workers)
print("dataloaders done!")
with open(args.phones_dict, "r") as f:
with open(args.phones_dict, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
print("vocab_size:", vocab_size)

@@ -78,7 +78,7 @@ def train_sp(args, config):
if args.speaker_dict is not None:
print("multiple speaker vits!")
collate_fn = vits_multi_spk_batch_fn
-with open(args.speaker_dict, 'rt') as f:
+with open(args.speaker_dict, 'rt', encoding='utf-8') as f:
spk_id = [line.strip().split() for line in f.readlines()]
spk_num = len(spk_id)
fields += ["spk_id"]
@@ -132,7 +132,7 @@ def train_sp(args, config):
num_workers=config.num_workers)
print("dataloaders done!")
with open(args.phones_dict, "r") as f:
with open(args.phones_dict, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
print("vocab_size:", vocab_size)

@@ -58,7 +58,7 @@ class English(Phonetics):
self.punc = ":,;。?!“”‘’':,;.?!"
self.text_normalizer = TextNormalizer()
if phone_vocab_path:
-with open(phone_vocab_path, 'rt') as f:
+with open(phone_vocab_path, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
for phn, id in phn_id:
self.vocab_phones[phn] = int(id)

@@ -144,12 +144,12 @@ class Frontend():
self.vocab_phones = {}
self.vocab_tones = {}
if phone_vocab_path:
-with open(phone_vocab_path, 'rt') as f:
+with open(phone_vocab_path, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
for phn, id in phn_id:
self.vocab_phones[phn] = int(id)
if tone_vocab_path:
-with open(tone_vocab_path, 'rt') as f:
+with open(tone_vocab_path, 'rt', encoding='utf-8') as f:
tone_id = [line.strip().split() for line in f.readlines()]
for tone, id in tone_id:
self.vocab_tones[tone] = int(id)

@@ -113,16 +113,18 @@ class Tacotron2Updater(StandardUpdater):
loss.backward()
optimizer.step()
+if self.use_guided_attn_loss:
+    report("train/attn_loss", float(attn_loss))
+    losses_dict["attn_loss"] = float(attn_loss)
report("train/l1_loss", float(l1_loss))
report("train/mse_loss", float(mse_loss))
report("train/bce_loss", float(bce_loss))
report("train/attn_loss", float(attn_loss))
report("train/loss", float(loss))
losses_dict["l1_loss"] = float(l1_loss)
losses_dict["mse_loss"] = float(mse_loss)
losses_dict["bce_loss"] = float(bce_loss)
losses_dict["attn_loss"] = float(attn_loss)
losses_dict["loss"] = float(loss)
self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_dict.items())
@@ -202,17 +204,19 @@ class Tacotron2Evaluator(StandardEvaluator):
attn_loss = self.attn_loss(
att_ws=att_ws, ilens=batch["text_lengths"] + 1, olens=olens_in)
loss = loss + attn_loss
+if self.use_guided_attn_loss:
+    report("eval/attn_loss", float(attn_loss))
+    losses_dict["attn_loss"] = float(attn_loss)
report("eval/l1_loss", float(l1_loss))
report("eval/mse_loss", float(mse_loss))
report("eval/bce_loss", float(bce_loss))
report("eval/attn_loss", float(attn_loss))
report("eval/loss", float(loss))
losses_dict["l1_loss"] = float(l1_loss)
losses_dict["mse_loss"] = float(mse_loss)
losses_dict["bce_loss"] = float(bce_loss)
losses_dict["attn_loss"] = float(attn_loss)
losses_dict["loss"] = float(loss)
self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_dict.items())
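Both hunks move the `attn_loss` reporting under the `use_guided_attn_loss` flag. That matters because `attn_loss` is only computed when guided attention is enabled, so reporting it unconditionally references a name that may never have been bound. A stripped-down sketch of the bug class (assumed shape, not repo code):

```python
use_guided_attn_loss = False
losses_dict = {}

if use_guided_attn_loss:
    attn_loss = 0.1  # only bound when the feature is on

# Old shape of the code -- raises NameError when the flag is off:
# losses_dict["attn_loss"] = float(attn_loss)

# New shape -- guarded, as in the hunks above:
if use_guided_attn_loss:
    losses_dict["attn_loss"] = float(attn_loss)
```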

@@ -37,9 +37,7 @@ base = [
"g2pM",
"h5py",
"inflect",
"jieba",
"jsonlines",
"kaldiio",
"librosa==0.8.1",
"loguru",
"matplotlib",
@@ -51,22 +49,16 @@ base = [
"paddlenlp>=2.4.8",
"ppdiffusers>=0.9.0",
"paddlespeech_feat",
"Pillow>=9.0.0",
"praatio==5.0.0",
"praatio>=5.0.0",
"pypinyin<=0.44.0",
"pypinyin-dict",
"python-dateutil",
"pyworld==0.2.12",
"resampy==0.2.2",
"pyworld>=0.2.12",
"resampy",
"sacrebleu",
"scipy",
"sentencepiece~=0.1.96",
"soundfile~=0.10",
"textgrid",
"timer",
"tqdm",
"typeguard",
"visualdl",
"webrtcvad",
"yacs~=0.1.8",
"prettytable",
@@ -74,10 +66,10 @@ base = [
"braceexpand",
"pyyaml",
"paddleslim>=2.3.4",
"paddleaudio>=1.0.2",
"paddleaudio>=1.1.0",
]
server = ["fastapi", "uvicorn", "pattern_singleton", "websockets"]
server = ["pattern_singleton", "websockets"]
requirements = {
"install":
@@ -300,7 +292,7 @@ setup_info = dict(
},
# Package info
-packages=find_packages(include=('paddlespeech*')),
+packages=find_packages(include=['paddlespeech*'], exclude=['utils', 'third_party']),
zip_safe=True,
classifiers=[
'Development Status :: 5 - Production/Stable',

@@ -73,6 +73,9 @@ if [[ ${MODE} = "benchmark_train" ]];then
mkdir -p BZNSYP
unrar x BZNSYP.rar BZNSYP
wget -nc https://paddlespeech.bj.bcebos.com/Parakeet/benchmark/durations.txt
+# Pre-fetch nltk_data so a network failure cannot leave the program hanging
+wget -nc https://paddlespeech.bj.bcebos.com/Parakeet/tools/nltk_data.tar.gz
+tar -xzf nltk_data.tar.gz -C ${HOME}
# data preprocessing
python ../paddlespeech/t2s/exps/gan_vocoder/preprocess.py --rootdir=BZNSYP/ --dumpdir=dump --num-cpu=20 --cut-sil=True --dur-file=durations.txt --config=../examples/csmsc/voc1/conf/default.yaml
python ../utils/compute_statistics.py --metadata=dump/train/raw/metadata.jsonl --field-name="feats"
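g2p_en pulls NLTK corpora on first import, so pre-seeding `~/nltk_data` keeps benchmark runs off the network. A quick way to confirm the unpacked data is visible (assuming the tagger and cmudict resources g2p_en normally fetches):

```python
import nltk

print(nltk.data.path)  # ~/nltk_data is on the default search path
nltk.data.find('taggers/averaged_perceptron_tagger')  # raises LookupError if missing
nltk.data.find('corpora/cmudict')
```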

@@ -129,7 +129,7 @@ decoders_module = [
setup(
name='paddlespeech_ctcdecoders',
-version='0.2.0',
+version='0.2.2',
description="CTC decoders in paddlespeech",
author="PaddlePaddle Speech and Language Team",
author_email="paddlesl@baidu.com",
