Merge branch 'develop' of github.com:iftaken/PaddleSpeech into dev-web

Branch: pull/2389/head
Author: iftaken (3 years ago)
Commit: b8c55c48a5

@@ -61,7 +61,7 @@ tts_python:
phones_dict:
tones_dict:
speaker_dict:
spk_id: 0
# voc (vocoder) choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3',
# 'pwgan_vctk', 'mb_melgan_csmsc', 'style_melgan_csmsc',
@@ -87,7 +87,7 @@ tts_inference:
phones_dict:
tones_dict:
speaker_dict:
spk_id: 0
am_predictor_conf:
device: # set 'gpu:id' or 'cpu'

@@ -29,7 +29,7 @@ tts_online:
phones_dict:
tones_dict:
speaker_dict:
spk_id: 0
# voc (vocoder) choices=['mb_melgan_csmsc, hifigan_csmsc']
# Both mb_melgan_csmsc and hifigan_csmsc support streaming voc inference
@@ -70,7 +70,6 @@ tts_online-onnx:
phones_dict:
tones_dict:
speaker_dict:
-spk_id: 0
am_sample_rate: 24000
am_sess_conf:
device: "cpu" # set 'gpu:id' or 'cpu'

@@ -29,7 +29,7 @@ tts_online:
phones_dict:
tones_dict:
speaker_dict:
spk_id: 0
# voc (vocoder) choices=['mb_melgan_csmsc, hifigan_csmsc']
# Both mb_melgan_csmsc and hifigan_csmsc support streaming voc inference
@@ -70,7 +70,6 @@ tts_online-onnx:
phones_dict:
tones_dict:
speaker_dict:
-spk_id: 0
am_sample_rate: 24000
am_sess_conf:
device: "cpu" # set 'gpu:id' or 'cpu'

@@ -5,6 +5,7 @@
- [Disambiguation of Chinese Polyphones in an End-to-End Framework with Semantic Features Extracted by Pre-trained BERT](https://www1.se.cuhk.edu.hk/~hccl/publications/pub/201909_INTERSPEECH_DongyangDAI.pdf)
- [Polyphone Disambiguation in Mandarin Chinese with Semi-Supervised Learning](https://www.isca-speech.org/archive/pdfs/interspeech_2021/shi21d_interspeech.pdf)
  * github: https://github.com/PaperMechanica/SemiPPL
+- [WikipediaHomographData](https://github.com/google-research-datasets/WikipediaHomographData)
### Text Normalization
#### English
- [applenob/text_normalization](https://github.com/applenob/text_normalization)

@@ -14,8 +14,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/synthesize_e2e.py \
    --task_name=synthesize \
    --wav_path=source/SSB03540307.wav \
    --old_str='请播放歌曲小苹果' \
    --new_str='歌曲真好听' \
    --source_lang=zh \
    --target_lang=zh \
    --erniesat_config=${config_path} \

@@ -29,9 +29,11 @@ Or train your MFA model reference to [mfa example](https://github.com/PaddlePadd
Assume the paths to the datasets are:
- `~/datasets/data_aishell3`
- `~/datasets/VCTK-Corpus-0.92`
Assume the path to the MFA results of the datasets are:
- `./aishell3_alignment_tone`
- `./vctk_alignment`
Run the command below to
1. **source path**.
2. preprocess the dataset.

@@ -15,7 +15,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/synthesize_e2e.py \
    --task_name=synthesize \
    --wav_path=source/p243_313.wav \
-    --old_str='For that reason cover should not be given.' \
+    --old_str='For that reason cover should not be given' \
    --new_str='今天天气很好' \
    --source_lang=en \
    --target_lang=zh \
@@ -36,8 +36,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${BIN_DIR}/synthesize_e2e.py \
    --task_name=synthesize \
    --wav_path=source/SSB03540307.wav \
    --old_str='请播放歌曲小苹果' \
-    --new_str="Thank you!" \
+    --new_str="Thank you" \
    --source_lang=zh \
    --target_lang=en \
    --erniesat_config=${config_path} \

@@ -75,6 +75,15 @@ When "Prepare" done. The structure of the current directory is listed below.
```
+### Set finetune.yaml
+`finetune.yaml` contains some configurations for fine-tuning. You can adjust these options to get a better fine-tuning result.
+Arguments:
+- `batch_size`: fine-tuning batch size. Default: -1, which means 64, the same as the pretrained model.
+- `learning_rate`: learning rate. Default: 0.0001.
+- `num_snapshots`: number of saved checkpoints. Default: -1, which means 5, the same as the pretrained model.
+- `frozen_layers`: layers to freeze; must be a list. If you don't want to freeze any layer, set it to [].
## Get Started
Run the command below to

@@ -14,6 +14,7 @@
import argparse
import os
from pathlib import Path
+from typing import List
from typing import Union
import yaml
@@ -21,10 +22,10 @@ from local.check_oov import get_check_result
from local.extract import extract_feature
from local.label_process import get_single_label
from local.prepare_env import generate_finetune_env
+from local.train import train_sp
from paddle import distributed as dist
from yacs.config import CfgNode
-from paddlespeech.t2s.exps.fastspeech2.train import train_sp
from utils.gen_duration_from_textgrid import gen_duration_from_textgrid

DICT_EN = 'tools/aligner/cmudict-0.7b'
@@ -38,15 +39,24 @@ os.environ['PATH'] = MFA_PATH + '/:' + os.environ['PATH']
class TrainArgs():
-    def __init__(self, ngpu, config_file, dump_dir: Path, output_dir: Path):
+    def __init__(self,
+                 ngpu,
+                 config_file,
+                 dump_dir: Path,
+                 output_dir: Path,
+                 frozen_layers: List[str]):
+        # config: fastspeech2 config file.
        self.config = str(config_file)
        self.train_metadata = str(dump_dir / "train/norm/metadata.jsonl")
        self.dev_metadata = str(dump_dir / "dev/norm/metadata.jsonl")
+        # model output dir.
        self.output_dir = str(output_dir)
        self.ngpu = ngpu
        self.phones_dict = str(dump_dir / "phone_id_map.txt")
        self.speaker_dict = str(dump_dir / "speaker_id_map.txt")
        self.voice_cloning = False
+        # frozen layers
+        self.frozen_layers = frozen_layers

def get_mfa_result(
@@ -122,12 +132,11 @@ if __name__ == '__main__':
        "--ngpu", type=int, default=2, help="if ngpu=0, use cpu.")
    parser.add_argument("--epoch", type=int, default=100, help="finetune epoch")
    parser.add_argument(
-        "--batch_size",
-        type=int,
-        default=-1,
-        help="batch size, default -1 means same as pretrained model")
+        "--finetune_config",
+        type=str,
+        default="./finetune.yaml",
+        help="Path to finetune config file")

    args = parser.parse_args()
@@ -147,8 +156,14 @@ if __name__ == '__main__':
    with open(config_file) as f:
        config = CfgNode(yaml.safe_load(f))
    config.max_epoch = config.max_epoch + args.epoch
-    if args.batch_size > 0:
-        config.batch_size = args.batch_size
+    with open(args.finetune_config) as f2:
+        finetune_config = CfgNode(yaml.safe_load(f2))
+    config.batch_size = finetune_config.batch_size if finetune_config.batch_size > 0 else config.batch_size
+    config.optimizer.learning_rate = finetune_config.learning_rate if finetune_config.learning_rate > 0 else config.optimizer.learning_rate
+    config.num_snapshots = finetune_config.num_snapshots if finetune_config.num_snapshots > 0 else config.num_snapshots
+    frozen_layers = finetune_config.frozen_layers
+    assert type(frozen_layers) == list, "frozen_layers should be set a list."

    if args.lang == 'en':
        lexicon_file = DICT_EN
@@ -158,6 +173,13 @@ if __name__ == '__main__':
        mfa_phone_file = MFA_PHONE_ZH
    else:
        print('please input right lang!!')
+    print(f"finetune max_epoch: {config.max_epoch}")
+    print(f"finetune batch_size: {config.batch_size}")
+    print(f"finetune learning_rate: {config.optimizer.learning_rate}")
+    print(f"finetune num_snapshots: {config.num_snapshots}")
+    print(f"finetune frozen_layers: {frozen_layers}")

    am_phone_file = pretrained_model_dir / "phone_id_map.txt"
    label_file = input_dir / "labels.txt"
@@ -181,7 +203,8 @@ if __name__ == '__main__':
    generate_finetune_env(output_dir, pretrained_model_dir)
    # create a new args for training
-    train_args = TrainArgs(args.ngpu, config_file, dump_dir, output_dir)
+    train_args = TrainArgs(args.ngpu, config_file, dump_dir, output_dir,
+                           frozen_layers)
    # finetune models
    # dispatch

@@ -0,0 +1,12 @@
###########################################################
#                     PARAS SETTING                       #
###########################################################
# Set to -1 to indicate that the parameter is the same as the pretrained model configuration
batch_size: -1
learning_rate: 0.0001 # learning rate
num_snapshots: -1
# frozen_layers should be a list
# if you don't need to freeze, set frozen_layers to []
frozen_layers: ["encoder", "duration_predictor"]
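For readers of the fine-tuning docs above, here is a minimal sketch (not part of the commit) of how these `finetune.yaml` values override the pretrained model's config, mirroring the logic added to `finetune.py`; the pretrained config path is a hypothetical placeholder, and `yacs` CfgNode is assumed as in the script.

```python
import yaml
from yacs.config import CfgNode

# Load the pretrained config and the fine-tuning overrides.
# "pretrained_models/.../default.yaml" is a placeholder path.
with open("pretrained_models/fastspeech2_csmsc_ckpt/default.yaml") as f:
    config = CfgNode(yaml.safe_load(f))
with open("finetune.yaml") as f:
    finetune_config = CfgNode(yaml.safe_load(f))

# -1 (or any non-positive value) keeps the pretrained setting.
if finetune_config.batch_size > 0:
    config.batch_size = finetune_config.batch_size
if finetune_config.learning_rate > 0:
    config.optimizer.learning_rate = finetune_config.learning_rate
if finetune_config.num_snapshots > 0:
    config.num_snapshots = finetune_config.num_snapshots

# frozen_layers must be a list; [] means no layer is frozen.
frozen_layers = finetune_config.frozen_layers
assert isinstance(frozen_layers, list), "frozen_layers should be a list."
```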

@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
-import math
import os
from operator import itemgetter
from pathlib import Path
@@ -211,9 +210,9 @@ def extract_feature(duration_file: str,
    mel_extractor, pitch_extractor, energy_extractor = get_extractor(config)
    wav_files = sorted(list((input_dir).rglob("*.wav")))
-    # split data into 3 sections, train: 80%, dev: 10%, test: 10%
-    num_train = math.ceil(len(wav_files) * 0.8)
-    num_dev = math.ceil(len(wav_files) * 0.1)
+    # split data into 3 sections, train: len(wav_files) - 2, dev: 1, test: 1
+    num_train = len(wav_files) - 2
+    num_dev = 1
    print(num_train, num_dev)
    train_wav_files = wav_files[:num_train]
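For reference, a small sketch (not from the commit) of what the new few-shot split amounts to; the dev/test slicing after `num_train` is an assumption based on the surrounding code, which only shows the training slice.

```python
from pathlib import Path


def split_wavs(input_dir: str):
    """Reserve one utterance for dev and one for test; train on the rest."""
    wav_files = sorted(Path(input_dir).rglob("*.wav"))
    num_train = len(wav_files) - 2  # was math.ceil(len(wav_files) * 0.8)
    num_dev = 1                     # was math.ceil(len(wav_files) * 0.1)
    train_wav_files = wav_files[:num_train]
    dev_wav_files = wav_files[num_train:num_train + num_dev]
    test_wav_files = wav_files[num_train + num_dev:]
    return train_wav_files, dev_wav_files, test_wav_files
```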

@@ -0,0 +1,178 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
import shutil
from pathlib import Path
from typing import List

import jsonlines
import numpy as np
import paddle
from paddle import DataParallel
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler

from paddlespeech.t2s.datasets.am_batch_fn import fastspeech2_multi_spk_batch_fn
from paddlespeech.t2s.datasets.am_batch_fn import fastspeech2_single_spk_batch_fn
from paddlespeech.t2s.datasets.data_table import DataTable
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Evaluator
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Updater
from paddlespeech.t2s.training.extensions.snapshot import Snapshot
from paddlespeech.t2s.training.extensions.visualizer import VisualDL
from paddlespeech.t2s.training.optimizer import build_optimizers
from paddlespeech.t2s.training.seeding import seed_everything
from paddlespeech.t2s.training.trainer import Trainer


def freeze_layer(model, layers: List[str]):
    """freeze layers

    Args:
        layers (List[str]): frozen layers
    """
    for layer in layers:
        for param in eval("model." + layer + ".parameters()"):
            param.trainable = False


def train_sp(args, config):
    # decides device type and whether to run in parallel
    # setup running environment correctly
    if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0:
        paddle.set_device("cpu")
    else:
        paddle.set_device("gpu")
    world_size = paddle.distributed.get_world_size()
    if world_size > 1:
        paddle.distributed.init_parallel_env()

    # set the random seed, it is a must for multiprocess training
    seed_everything(config.seed)

    print(
        f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}",
    )
    fields = [
        "text", "text_lengths", "speech", "speech_lengths", "durations",
        "pitch", "energy"
    ]
    converters = {"speech": np.load, "pitch": np.load, "energy": np.load}
    spk_num = None
    if args.speaker_dict is not None:
        print("multiple speaker fastspeech2!")
        collate_fn = fastspeech2_multi_spk_batch_fn
        with open(args.speaker_dict, 'rt') as f:
            spk_id = [line.strip().split() for line in f.readlines()]
        spk_num = len(spk_id)
        fields += ["spk_id"]
    elif args.voice_cloning:
        print("Training voice cloning!")
        collate_fn = fastspeech2_multi_spk_batch_fn
        fields += ["spk_emb"]
        converters["spk_emb"] = np.load
    else:
        print("single speaker fastspeech2!")
        collate_fn = fastspeech2_single_spk_batch_fn
    print("spk_num:", spk_num)

    # dataloader has been too verbose
    logging.getLogger("DataLoader").disabled = True

    # construct dataset for training and validation
    with jsonlines.open(args.train_metadata, 'r') as reader:
        train_metadata = list(reader)
    train_dataset = DataTable(
        data=train_metadata,
        fields=fields,
        converters=converters, )
    with jsonlines.open(args.dev_metadata, 'r') as reader:
        dev_metadata = list(reader)
    dev_dataset = DataTable(
        data=dev_metadata,
        fields=fields,
        converters=converters, )

    # collate function and dataloader
    train_sampler = DistributedBatchSampler(
        train_dataset,
        batch_size=config.batch_size,
        shuffle=True,
        drop_last=True)

    print("samplers done!")

    train_dataloader = DataLoader(
        train_dataset,
        batch_sampler=train_sampler,
        collate_fn=collate_fn,
        num_workers=config.num_workers)

    dev_dataloader = DataLoader(
        dev_dataset,
        shuffle=False,
        drop_last=False,
        batch_size=config.batch_size,
        collate_fn=collate_fn,
        num_workers=config.num_workers)
    print("dataloaders done!")

    with open(args.phones_dict, "r") as f:
        phn_id = [line.strip().split() for line in f.readlines()]
    vocab_size = len(phn_id)
    print("vocab_size:", vocab_size)

    odim = config.n_mels
    model = FastSpeech2(
        idim=vocab_size, odim=odim, spk_num=spk_num, **config["model"])

    # freeze layer
    if args.frozen_layers != []:
        freeze_layer(model, args.frozen_layers)

    if world_size > 1:
        model = DataParallel(model)
    print("model done!")

    optimizer = build_optimizers(model, **config["optimizer"])
    print("optimizer done!")

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    if dist.get_rank() == 0:
        config_name = args.config.split("/")[-1]
        # copy conf to output_dir
        shutil.copyfile(args.config, output_dir / config_name)

    updater = FastSpeech2Updater(
        model=model,
        optimizer=optimizer,
        dataloader=train_dataloader,
        output_dir=output_dir,
        **config["updater"])

    trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir)

    evaluator = FastSpeech2Evaluator(
        model, dev_dataloader, output_dir=output_dir, **config["updater"])

    if dist.get_rank() == 0:
        trainer.extend(evaluator, trigger=(1, "epoch"))
        trainer.extend(VisualDL(output_dir), trigger=(1, "iteration"))
        trainer.extend(
            Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch'))
    trainer.run()
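A note on `freeze_layer` above: it resolves layer names with `eval`, so each entry in `frozen_layers` has to be an attribute path on the FastSpeech2 model (e.g. "encoder" or "duration_predictor"). Below is a minimal eval-free sketch of the same idea, assuming the names are dotted attribute paths; it is an illustration, not part of the commit.

```python
from typing import List

import paddle


def freeze_layer_by_attr(model: paddle.nn.Layer, layers: List[str]):
    """Set trainable=False for every parameter of the named sub-layers,
    resolving dotted attribute paths with getattr instead of eval."""
    for name in layers:
        sub_layer = model
        for attr in name.split("."):
            sub_layer = getattr(sub_layer, attr)
        for param in sub_layer.parameters():
            param.trainable = False
```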

@@ -10,11 +10,12 @@ mfa_dir=./mfa_result
dump_dir=./dump
output_dir=./exp/default
lang=zh
-ngpu=2
+ngpu=1
+finetune_config=./finetune.yaml
-ckpt=snapshot_iter_96600
+ckpt=snapshot_iter_96699
-gpus=0,1
+gpus=1
CUDA_VISIBLE_DEVICES=${gpus}
stage=0
stop_stage=100
@@ -35,7 +36,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --output_dir=${output_dir} \
        --lang=${lang} \
        --ngpu=${ngpu} \
-        --epoch=100
+        --epoch=100 \
+        --finetune_config=${finetune_config}
fi
@@ -54,7 +56,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --voc_stat=pretrained_models/hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
        --lang=zh \
        --text=${BIN_DIR}/../sentences.txt \
-        --output_dir=./test_e2e \
+        --output_dir=./test_e2e/ \
        --phones_dict=${dump_dir}/phone_id_map.txt \
        --speaker_dict=${dump_dir}/speaker_id_map.txt \
        --spk_id=0

@@ -14,7 +14,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/synthesize_e2e.py \
    --task_name=synthesize \
    --wav_path=source/p243_313.wav \
-    --old_str='For that reason cover should not be given.' \
+    --old_str='For that reason cover should not be given' \
    --new_str='I love you very much do you love me' \
    --source_lang=en \
    --target_lang=en \
@@ -36,8 +36,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${BIN_DIR}/synthesize_e2e.py \
    --task_name=edit \
    --wav_path=source/p243_313.wav \
-    --old_str='For that reason cover should not be given.' \
-    --new_str='For that reason cover is not impossible to be given.' \
+    --old_str='For that reason cover should not be given' \
+    --new_str='For that reason cover is not impossible to be given' \
    --source_lang=en \
    --target_lang=en \
    --erniesat_config=${config_path} \

@@ -148,4 +148,4 @@ source path.sh
CUDA_VISIBLE_DEVICES= bash ./local/test.sh ./data sv0_ecapa_tdnn_voxceleb12_ckpt_0_2_1/model/ conf/ecapa_tdnn.yaml
```
-The performance of the released models are shown in [this](./RESULTS.md)
+The performance of the released models are shown in [this](./RESULT.md)

@@ -34,3 +34,15 @@ Pretrain model from http://mobvoi-speech-public.ufile.ucloud.cn/public/wenet/wen
| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | - | 0.052534 |
| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | - | 0.052915 |
| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention_rescoring | - | 0.047904 |
+## Conformer Streaming Pretrained Model
+Pretrain model from https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz
+| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size | CER |
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention | 16 | 0.056273 |
+| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | 16 | 0.078918 |
+| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | 16 | 0.079080 |
+| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention_rescoring | 16 | 0.054401 |

@@ -605,8 +605,8 @@ class U2BaseModel(ASRInterface, nn.Layer):
            xs: paddle.Tensor,
            offset: int,
            required_cache_size: int,
-            att_cache: paddle.Tensor,  # paddle.zeros([0, 0, 0, 0])
-            cnn_cache: paddle.Tensor,  # paddle.zeros([0, 0, 0, 0])
+            att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]),
+            cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0])
    ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
        """ Export interface for c++ call, give input chunk xs, and return
        output from time 0 to current chunk.

@@ -86,7 +86,7 @@ class MultiHeadedAttention(nn.Layer):
            self,
            value: paddle.Tensor,
            scores: paddle.Tensor,
-            mask: paddle.Tensor,  # paddle.ones([0, 0, 0], dtype=paddle.bool)
+            mask: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool)
    ) -> paddle.Tensor:
        """Compute attention context vector.
        Args:
@@ -127,14 +127,13 @@ class MultiHeadedAttention(nn.Layer):
        return self.linear_out(x)  # (batch, time1, d_model)

-    def forward(
-            self,
+    def forward(self,
                query: paddle.Tensor,
                key: paddle.Tensor,
                value: paddle.Tensor,
-                mask: paddle.Tensor,  # paddle.ones([0,0,0], dtype=paddle.bool)
-                pos_emb: paddle.Tensor,  # paddle.empty([0])
-                cache: paddle.Tensor  # paddle.zeros([0,0,0,0])
+                mask: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool),
+                pos_emb: paddle.Tensor=paddle.empty([0]),
+                cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0])
    ) -> Tuple[paddle.Tensor, paddle.Tensor]:
        """Compute scaled dot product attention.
        Args:
@@ -244,14 +243,13 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
        return x

-    def forward(
-            self,
+    def forward(self,
                query: paddle.Tensor,
                key: paddle.Tensor,
                value: paddle.Tensor,
-                mask: paddle.Tensor,  # paddle.ones([0,0,0], dtype=paddle.bool)
-                pos_emb: paddle.Tensor,  # paddle.empty([0])
-                cache: paddle.Tensor  # paddle.zeros([0,0,0,0])
+                mask: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool),
+                pos_emb: paddle.Tensor=paddle.empty([0]),
+                cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0])
    ) -> Tuple[paddle.Tensor, paddle.Tensor]:
        """Compute 'Scaled Dot Product Attention' with rel. positional encoding.
        Args:

@@ -108,8 +108,8 @@ class ConvolutionModule(nn.Layer):
    def forward(
            self,
            x: paddle.Tensor,
-            mask_pad: paddle.Tensor,  # paddle.ones([0,0,0], dtype=paddle.bool)
-            cache: paddle.Tensor  # paddle.zeros([0,0,0,0])
+            mask_pad: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool),
+            cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0])
    ) -> Tuple[paddle.Tensor, paddle.Tensor]:
        """Compute convolution module.
        Args:

@@ -121,16 +121,11 @@ class DecoderLayer(nn.Layer):
        if self.concat_after:
            tgt_concat = paddle.cat(
-                (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask,
-                                       paddle.empty([0]),
-                                       paddle.zeros([0, 0, 0, 0]))[0]),
-                dim=-1)
+                (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]), dim=-1)
            x = residual + self.concat_linear1(tgt_concat)
        else:
            x = residual + self.dropout(
-                self.self_attn(tgt_q, tgt, tgt, tgt_q_mask,
-                               paddle.empty([0]), paddle.zeros([0, 0, 0, 0]))[
-                                   0])
+                self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0])
        if not self.normalize_before:
            x = self.norm1(x)
@@ -139,15 +134,11 @@ class DecoderLayer(nn.Layer):
            x = self.norm2(x)
        if self.concat_after:
            x_concat = paddle.cat(
-                (x, self.src_attn(x, memory, memory, memory_mask,
-                                  paddle.empty([0]),
-                                  paddle.zeros([0, 0, 0, 0]))[0]),
-                dim=-1)
+                (x, self.src_attn(x, memory, memory, memory_mask)[0]), dim=-1)
            x = residual + self.concat_linear2(x_concat)
        else:
            x = residual + self.dropout(
-                self.src_attn(x, memory, memory, memory_mask,
-                              paddle.empty([0]), paddle.zeros([0, 0, 0, 0]))[0])
+                self.src_attn(x, memory, memory, memory_mask)[0])
        if not self.normalize_before:
            x = self.norm2(x)

@@ -175,9 +175,7 @@ class BaseEncoder(nn.Layer):
                decoding_chunk_size, self.static_chunk_size,
                num_decoding_left_chunks)
        for layer in self.encoders:
-            xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad,
-                                          paddle.zeros([0, 0, 0, 0]),
-                                          paddle.zeros([0, 0, 0, 0]))
+            xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
        if self.normalize_before:
            xs = self.after_norm(xs)
        # Here we assume the mask is not changed in encoder layers, so just
@@ -190,9 +188,9 @@ class BaseEncoder(nn.Layer):
            xs: paddle.Tensor,
            offset: int,
            required_cache_size: int,
-            att_cache: paddle.Tensor,  # paddle.zeros([0,0,0,0])
-            cnn_cache: paddle.Tensor,  # paddle.zeros([0,0,0,0]),
-            att_mask: paddle.Tensor,  # paddle.ones([0,0,0], dtype=paddle.bool)
+            att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]),
+            cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]),
+            att_mask: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool)
    ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
        """ Forward just one chunk
        Args:
@@ -255,7 +253,6 @@ class BaseEncoder(nn.Layer):
                xs,
                att_mask,
                pos_emb,
-                mask_pad=paddle.ones([0, 0, 0], dtype=paddle.bool),
                att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache,
                cnn_cache=cnn_cache[i:i + 1]
                if paddle.shape(cnn_cache)[0] > 0 else cnn_cache, )
@@ -328,8 +325,7 @@ class BaseEncoder(nn.Layer):
            chunk_xs = xs[:, cur:end, :]
            (y, att_cache, cnn_cache) = self.forward_chunk(
-                chunk_xs, offset, required_cache_size, att_cache, cnn_cache,
-                paddle.ones([0, 0, 0], dtype=paddle.bool))
+                chunk_xs, offset, required_cache_size, att_cache, cnn_cache)
            outputs.append(y)
            offset += y.shape[1]
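With the default cache arguments above, a caller can drive chunk-by-chunk encoding without threading placeholder masks through every call. Below is a rough sketch of the streaming loop, assuming `encoder` is a `BaseEncoder` instance and `feature_chunks` yields already-sized feature tensors (both illustrative names, not part of the commit).

```python
import paddle

# Zero-size tensors mean "no cache yet"; forward_chunk returns the
# updated caches to feed into the next call.
att_cache = paddle.zeros([0, 0, 0, 0])
cnn_cache = paddle.zeros([0, 0, 0, 0])
offset = 0
required_cache_size = 16  # illustrative value, in decoding frames

outputs = []
for chunk_xs in feature_chunks:  # each chunk: [1, time, feat_dim]
    y, att_cache, cnn_cache = encoder.forward_chunk(
        chunk_xs, offset, required_cache_size, att_cache, cnn_cache)
    outputs.append(y)
    offset += y.shape[1]
ys = paddle.concat(outputs, axis=1)
```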

@@ -76,10 +76,9 @@ class TransformerEncoderLayer(nn.Layer):
            x: paddle.Tensor,
            mask: paddle.Tensor,
            pos_emb: paddle.Tensor,
-            mask_pad: paddle.
-            Tensor,  # paddle.ones([0, 0, 0], dtype=paddle.bool)
-            att_cache: paddle.Tensor,  # paddle.zeros([0, 0, 0, 0])
-            cnn_cache: paddle.Tensor,  # paddle.zeros([0, 0, 0, 0])
+            mask_pad: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool),
+            att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]),
+            cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0])
    ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]:
        """Compute encoded features.
        Args:
@@ -106,8 +105,7 @@ class TransformerEncoderLayer(nn.Layer):
        if self.normalize_before:
            x = self.norm1(x)
-        x_att, new_att_cache = self.self_attn(
-            x, x, x, mask, paddle.empty([0]), cache=att_cache)
+        x_att, new_att_cache = self.self_attn(x, x, x, mask, cache=att_cache)
        if self.concat_after:
            x_concat = paddle.concat((x, x_att), axis=-1)
@@ -195,9 +193,9 @@ class ConformerEncoderLayer(nn.Layer):
            x: paddle.Tensor,
            mask: paddle.Tensor,
            pos_emb: paddle.Tensor,
-            mask_pad: paddle.Tensor,  # paddle.ones([0, 0, 0],dtype=paddle.bool)
-            att_cache: paddle.Tensor,  # paddle.zeros([0, 0, 0, 0])
-            cnn_cache: paddle.Tensor,  # paddle.zeros([0, 0, 0, 0])
+            mask_pad: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool),
+            att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]),
+            cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0])
    ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]:
        """Compute encoded features.
        Args:

@@ -19,6 +19,10 @@ from pathlib import Path
import paddle
from paddle import distributed as dist
+world_size = dist.get_world_size()
+if world_size > 1:
+    dist.init_parallel_env()
from visualdl import LogWriter
from paddlespeech.s2t.training.reporter import ObsScope
@@ -122,9 +126,6 @@ class Trainer():
        else:
            raise Exception("invalid device")
-        if self.parallel:
-            self.init_parallel()
        self.checkpoint = Checkpoint(
            kbest_n=self.config.checkpoint.kbest_n,
            latest_n=self.config.checkpoint.latest_n)
@@ -173,11 +174,6 @@ class Trainer():
        """
        return self.args.ngpu > 1

-    def init_parallel(self):
-        """Init environment for multiprocess training.
-        """
-        dist.init_parallel_env()

    @mp_tools.rank_zero_only
    def save(self, tag=None, infos: dict=None):
        """Save checkpoint (model parameters and optimizer states).

@@ -480,8 +480,7 @@ class PaddleASRConnectionHanddler:
                self.offset,
                required_cache_size,
                att_cache=self.att_cache,
-                cnn_cache=self.cnn_cache,
-                att_mask=paddle.ones([0, 0, 0], dtype=paddle.bool))
+                cnn_cache=self.cnn_cache)
            outputs.append(y)
            # update the global offset, in decoding frame unit

@@ -27,8 +27,10 @@ def warm_up(engine_and_type: str, warm_up_time: int=3) -> bool:
        sentence = "您好,欢迎使用语音合成服务。"
    elif tts_engine.lang == 'en':
        sentence = "Hello and welcome to the speech synthesis service."
+    elif tts_engine.lang == 'mix':
+        sentence = "您好欢迎使用TTS多语种服务。"
    else:
-        logger.error("tts engine only support lang: zh or en.")
+        logger.error("tts engine only support lang: zh or en or mix.")
        sys.exit(-1)

    if engine_and_type == "tts_python":

@@ -58,7 +58,7 @@ def _readtg(tg_path: str, lang: str='en', fs: int=24000, n_shift: int=300):
        durations[-2] += durations[-1]
        durations = durations[:-1]
-    # replace ' and 'sil' with 'sp'
+    # replace '' and 'sil' with 'sp'
    phones = ['sp' if (phn == '' or phn == 'sil') else phn for phn in phones]

    if lang == 'en':
@@ -195,7 +195,7 @@ def words2phns(text: str, lang='en'):
            wrd = wrd.upper()
            if (wrd not in ds):
                wrd2phns[str(index) + '_' + wrd] = 'spn'
-                phns.extend('spn')
+                phns.extend(['spn'])
            else:
                wrd2phns[str(index) + '_' + wrd] = word2phns_dict[wrd].split()
                phns.extend(word2phns_dict[wrd].split())
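For context on why this one-character fix matters: `list.extend` iterates its argument, so extending with a bare string appends each character, while extending with a one-element list appends the whole token. A quick illustration:

```python
phns = ['AH0']
phns.extend('spn')    # iterates the string -> ['AH0', 's', 'p', 'n']

phns = ['AH0']
phns.extend(['spn'])  # appends the token   -> ['AH0', 'spn']
```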

@@ -137,9 +137,6 @@ def prep_feats_with_dur(wav_path: str,
        new_wav = np.concatenate(
            [wav_org[:wav_left_idx], blank_wav, wav_org[wav_right_idx:]])
-    # 音频是正常遮住了
-    sf.write(str("mask_wav.wav"), new_wav, samplerate=fs)
    # 4. get old and new mel span to be mask
    old_span_bdy = get_span_bdy(
        mfa_start=mfa_start, mfa_end=mfa_end, span_to_repl=span_to_repl)
@@ -274,7 +271,8 @@ def get_wav(wav_path: str,
            new_str: str='',
            duration_adjust: bool=True,
            fs: int=24000,
-            n_shift: int=300):
+            n_shift: int=300,
+            task_name: str='synthesize'):
    outs = get_mlm_output(
        wav_path=wav_path,
@@ -298,9 +296,11 @@ def get_wav(wav_path: str,
    alt_wav = np.squeeze(alt_wav)
    old_time_bdy = [n_shift * x for x in old_span_bdy]
-    wav_replaced = np.concatenate(
-        [wav_org[:old_time_bdy[0]], alt_wav, wav_org[old_time_bdy[1]:]])
+    if task_name == 'edit':
+        wav_replaced = np.concatenate(
+            [wav_org[:old_time_bdy[0]], alt_wav, wav_org[old_time_bdy[1]:]])
+    else:
+        wav_replaced = alt_wav
    wav_dict = {"origin": wav_org, "output": wav_replaced}

    return wav_dict
@@ -356,7 +356,11 @@ def parse_args():
        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
    # ernie sat related
-    parser.add_argument("--task_name", type=str, help="task name")
+    parser.add_argument(
+        "--task_name",
+        type=str,
+        choices=['edit', 'synthesize'],
+        help="task name.")
    parser.add_argument("--wav_path", type=str, help="path of old wav")
    parser.add_argument("--old_str", type=str, help="old string")
    parser.add_argument("--new_str", type=str, help="new string")
@@ -410,10 +414,9 @@ if __name__ == '__main__':
    if args.task_name == 'edit':
        new_str = new_str
    elif args.task_name == 'synthesize':
-        new_str = old_str + new_str
+        new_str = old_str + ' ' + new_str
    else:
-        new_str = old_str + new_str
-    print("new_str:", new_str)
+        new_str = old_str + ' ' + new_str

    # Extractor
    mel_extractor = LogMelFBank(
@@ -467,7 +470,8 @@ if __name__ == '__main__':
        new_str=new_str,
        duration_adjust=args.duration_adjust,
        fs=erniesat_config.fs,
-        n_shift=erniesat_config.n_shift)
+        n_shift=erniesat_config.n_shift,
+        task_name=args.task_name)

    sf.write(
        args.output_name, wav_dict['output'], samplerate=erniesat_config.fs)

@@ -1,4 +1,4 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

@@ -30,7 +30,7 @@ class ToneSandhi():
            '蛤蟆', '蘑菇', '薄荷', '葫芦', '葡萄', '萝卜', '荸荠', '苗条', '苗头', '苍蝇', '芝麻',
            '舒服', '舒坦', '舌头', '自在', '膏药', '脾气', '脑袋', '脊梁', '能耐', '胳膊', '胭脂',
            '胡萝', '胡琴', '胡同', '聪明', '耽误', '耽搁', '耷拉', '耳朵', '老爷', '老实', '老婆',
-            '老头', '老太', '翻腾', '罗嗦', '罐头', '编辑', '结实', '红火', '累赘', '糨糊', '糊涂',
+            '戏弄', '将军', '翻腾', '罗嗦', '罐头', '编辑', '结实', '红火', '累赘', '糨糊', '糊涂',
            '精神', '粮食', '簸箕', '篱笆', '算计', '算盘', '答应', '笤帚', '笑语', '笑话', '窟窿',
            '窝囊', '窗户', '稳当', '稀罕', '称呼', '秧歌', '秀气', '秀才', '福气', '祖宗', '砚台',
            '码头', '石榴', '石头', '石匠', '知识', '眼睛', '眯缝', '眨巴', '眉毛', '相声', '盘算',
@@ -59,8 +59,7 @@ class ToneSandhi():
            '下巴', '上头', '上司', '丈夫', '丈人', '一辈', '那个', '菩萨', '父亲', '母亲', '咕噜',
            '邋遢', '费用', '冤家', '甜头', '介绍', '荒唐', '大人', '泥鳅', '幸福', '熟悉', '计划',
            '扑腾', '蜡烛', '姥爷', '照顾', '喉咙', '吉他', '弄堂', '蚂蚱', '凤凰', '拖沓', '寒碜',
-            '糟蹋', '倒腾', '报复', '逻辑', '盘缠', '喽啰', '牢骚', '咖喱', '扫把', '惦记', '戏弄',
-            '将军'
+            '糟蹋', '倒腾', '报复', '逻辑', '盘缠', '喽啰', '牢骚', '咖喱', '扫把', '惦记'
        }
        self.must_not_neural_tone_words = {
            '男子', '女子', '分子', '原子', '量子', '莲子', '石子', '瓜子', '电子', '人人', '虎虎',

@@ -15,6 +15,7 @@ dataline=$(cat ${FILENAME})
# parser params
IFS=$'\n'
lines=(${dataline})
+python=python

# The training params
model_name=$(func_parser_value "${lines[1]}")
@@ -68,7 +69,7 @@ if [[ ${MODE} = "benchmark_train" ]];then
    if [[ ${model_name} == "pwgan" ]]; then
        # 下载 csmsc 数据集并解压缩
-        wget -nc https://weixinxcxdb.oss-cn-beijing.aliyuncs.com/gwYinPinKu/BZNSYP.rar
+        wget -nc https://paddle-wheel.bj.bcebos.com/benchmark/BZNSYP.rar
        mkdir -p BZNSYP
        unrar x BZNSYP.rar BZNSYP
        wget -nc https://paddlespeech.bj.bcebos.com/Parakeet/benchmark/durations.txt
@@ -80,6 +81,10 @@ if [[ ${MODE} = "benchmark_train" ]];then
        python ../paddlespeech/t2s/exps/gan_vocoder/normalize.py --metadata=dump/test/raw/metadata.jsonl --dumpdir=dump/test/norm --stats=dump/train/feats_stats.npy
    fi
+    echo "barrier start"
+    PYTHON="${python}" bash test_tipc/barrier.sh
+    echo "barrier end"
    if [[ ${model_name} == "mdtc" ]]; then
        # 下载 Snips 数据集并解压缩
        wget https://paddlespeech.bj.bcebos.com/datasets/hey_snips_kws_4.0.tar.gz.1
