parent ffdc17fd97
commit 8636096484
@@ -0,0 +1,216 @@
epoch
iteration
main_params
main_optimizer
spk_embedding_table.weight
encoder.embed.0.weight
encoder.embed.1.alpha
encoder.encoders.0.self_attn.linear_q.weight
encoder.encoders.0.self_attn.linear_q.bias
encoder.encoders.0.self_attn.linear_k.weight
encoder.encoders.0.self_attn.linear_k.bias
encoder.encoders.0.self_attn.linear_v.weight
encoder.encoders.0.self_attn.linear_v.bias
encoder.encoders.0.self_attn.linear_out.weight
encoder.encoders.0.self_attn.linear_out.bias
encoder.encoders.0.feed_forward.w_1.weight
encoder.encoders.0.feed_forward.w_1.bias
encoder.encoders.0.feed_forward.w_2.weight
encoder.encoders.0.feed_forward.w_2.bias
encoder.encoders.0.norm1.weight
encoder.encoders.0.norm1.bias
encoder.encoders.0.norm2.weight
encoder.encoders.0.norm2.bias
encoder.encoders.1.self_attn.linear_q.weight
encoder.encoders.1.self_attn.linear_q.bias
encoder.encoders.1.self_attn.linear_k.weight
encoder.encoders.1.self_attn.linear_k.bias
encoder.encoders.1.self_attn.linear_v.weight
encoder.encoders.1.self_attn.linear_v.bias
encoder.encoders.1.self_attn.linear_out.weight
encoder.encoders.1.self_attn.linear_out.bias
encoder.encoders.1.feed_forward.w_1.weight
encoder.encoders.1.feed_forward.w_1.bias
encoder.encoders.1.feed_forward.w_2.weight
encoder.encoders.1.feed_forward.w_2.bias
encoder.encoders.1.norm1.weight
encoder.encoders.1.norm1.bias
encoder.encoders.1.norm2.weight
encoder.encoders.1.norm2.bias
encoder.encoders.2.self_attn.linear_q.weight
encoder.encoders.2.self_attn.linear_q.bias
encoder.encoders.2.self_attn.linear_k.weight
encoder.encoders.2.self_attn.linear_k.bias
encoder.encoders.2.self_attn.linear_v.weight
encoder.encoders.2.self_attn.linear_v.bias
encoder.encoders.2.self_attn.linear_out.weight
encoder.encoders.2.self_attn.linear_out.bias
encoder.encoders.2.feed_forward.w_1.weight
encoder.encoders.2.feed_forward.w_1.bias
encoder.encoders.2.feed_forward.w_2.weight
encoder.encoders.2.feed_forward.w_2.bias
encoder.encoders.2.norm1.weight
encoder.encoders.2.norm1.bias
encoder.encoders.2.norm2.weight
encoder.encoders.2.norm2.bias
encoder.encoders.3.self_attn.linear_q.weight
encoder.encoders.3.self_attn.linear_q.bias
encoder.encoders.3.self_attn.linear_k.weight
encoder.encoders.3.self_attn.linear_k.bias
encoder.encoders.3.self_attn.linear_v.weight
encoder.encoders.3.self_attn.linear_v.bias
encoder.encoders.3.self_attn.linear_out.weight
encoder.encoders.3.self_attn.linear_out.bias
encoder.encoders.3.feed_forward.w_1.weight
encoder.encoders.3.feed_forward.w_1.bias
encoder.encoders.3.feed_forward.w_2.weight
encoder.encoders.3.feed_forward.w_2.bias
encoder.encoders.3.norm1.weight
encoder.encoders.3.norm1.bias
encoder.encoders.3.norm2.weight
encoder.encoders.3.norm2.bias
encoder.after_norm.weight
encoder.after_norm.bias
spk_projection.weight
spk_projection.bias
duration_predictor.conv.0.0.weight
duration_predictor.conv.0.0.bias
duration_predictor.conv.0.2.weight
duration_predictor.conv.0.2.bias
duration_predictor.conv.1.0.weight
duration_predictor.conv.1.0.bias
duration_predictor.conv.1.2.weight
duration_predictor.conv.1.2.bias
duration_predictor.linear.weight
duration_predictor.linear.bias
pitch_predictor.conv.0.0.weight
pitch_predictor.conv.0.0.bias
pitch_predictor.conv.0.2.weight
pitch_predictor.conv.0.2.bias
pitch_predictor.conv.1.0.weight
pitch_predictor.conv.1.0.bias
pitch_predictor.conv.1.2.weight
pitch_predictor.conv.1.2.bias
pitch_predictor.conv.2.0.weight
pitch_predictor.conv.2.0.bias
pitch_predictor.conv.2.2.weight
pitch_predictor.conv.2.2.bias
pitch_predictor.conv.3.0.weight
pitch_predictor.conv.3.0.bias
pitch_predictor.conv.3.2.weight
pitch_predictor.conv.3.2.bias
pitch_predictor.conv.4.0.weight
pitch_predictor.conv.4.0.bias
pitch_predictor.conv.4.2.weight
pitch_predictor.conv.4.2.bias
pitch_predictor.linear.weight
pitch_predictor.linear.bias
pitch_embed.0.weight
pitch_embed.0.bias
energy_predictor.conv.0.0.weight
energy_predictor.conv.0.0.bias
energy_predictor.conv.0.2.weight
energy_predictor.conv.0.2.bias
energy_predictor.conv.1.0.weight
energy_predictor.conv.1.0.bias
energy_predictor.conv.1.2.weight
energy_predictor.conv.1.2.bias
energy_predictor.linear.weight
energy_predictor.linear.bias
energy_embed.0.weight
energy_embed.0.bias
decoder.embed.0.alpha
decoder.encoders.0.self_attn.linear_q.weight
decoder.encoders.0.self_attn.linear_q.bias
decoder.encoders.0.self_attn.linear_k.weight
decoder.encoders.0.self_attn.linear_k.bias
decoder.encoders.0.self_attn.linear_v.weight
decoder.encoders.0.self_attn.linear_v.bias
decoder.encoders.0.self_attn.linear_out.weight
decoder.encoders.0.self_attn.linear_out.bias
decoder.encoders.0.feed_forward.w_1.weight
decoder.encoders.0.feed_forward.w_1.bias
decoder.encoders.0.feed_forward.w_2.weight
decoder.encoders.0.feed_forward.w_2.bias
decoder.encoders.0.norm1.weight
decoder.encoders.0.norm1.bias
decoder.encoders.0.norm2.weight
decoder.encoders.0.norm2.bias
decoder.encoders.1.self_attn.linear_q.weight
decoder.encoders.1.self_attn.linear_q.bias
decoder.encoders.1.self_attn.linear_k.weight
decoder.encoders.1.self_attn.linear_k.bias
decoder.encoders.1.self_attn.linear_v.weight
decoder.encoders.1.self_attn.linear_v.bias
decoder.encoders.1.self_attn.linear_out.weight
decoder.encoders.1.self_attn.linear_out.bias
decoder.encoders.1.feed_forward.w_1.weight
decoder.encoders.1.feed_forward.w_1.bias
decoder.encoders.1.feed_forward.w_2.weight
decoder.encoders.1.feed_forward.w_2.bias
decoder.encoders.1.norm1.weight
decoder.encoders.1.norm1.bias
decoder.encoders.1.norm2.weight
decoder.encoders.1.norm2.bias
decoder.encoders.2.self_attn.linear_q.weight
decoder.encoders.2.self_attn.linear_q.bias
decoder.encoders.2.self_attn.linear_k.weight
decoder.encoders.2.self_attn.linear_k.bias
decoder.encoders.2.self_attn.linear_v.weight
decoder.encoders.2.self_attn.linear_v.bias
decoder.encoders.2.self_attn.linear_out.weight
decoder.encoders.2.self_attn.linear_out.bias
decoder.encoders.2.feed_forward.w_1.weight
decoder.encoders.2.feed_forward.w_1.bias
decoder.encoders.2.feed_forward.w_2.weight
decoder.encoders.2.feed_forward.w_2.bias
decoder.encoders.2.norm1.weight
decoder.encoders.2.norm1.bias
decoder.encoders.2.norm2.weight
decoder.encoders.2.norm2.bias
decoder.encoders.3.self_attn.linear_q.weight
decoder.encoders.3.self_attn.linear_q.bias
decoder.encoders.3.self_attn.linear_k.weight
decoder.encoders.3.self_attn.linear_k.bias
decoder.encoders.3.self_attn.linear_v.weight
decoder.encoders.3.self_attn.linear_v.bias
decoder.encoders.3.self_attn.linear_out.weight
decoder.encoders.3.self_attn.linear_out.bias
decoder.encoders.3.feed_forward.w_1.weight
decoder.encoders.3.feed_forward.w_1.bias
decoder.encoders.3.feed_forward.w_2.weight
decoder.encoders.3.feed_forward.w_2.bias
decoder.encoders.3.norm1.weight
decoder.encoders.3.norm1.bias
decoder.encoders.3.norm2.weight
decoder.encoders.3.norm2.bias
decoder.after_norm.weight
decoder.after_norm.bias
feat_out.weight
feat_out.bias
postnet.postnet.0.0.weight
postnet.postnet.0.1.weight
postnet.postnet.0.1.bias
postnet.postnet.0.1._mean
postnet.postnet.0.1._variance
postnet.postnet.1.0.weight
postnet.postnet.1.1.weight
postnet.postnet.1.1.bias
postnet.postnet.1.1._mean
postnet.postnet.1.1._variance
postnet.postnet.2.0.weight
postnet.postnet.2.1.weight
postnet.postnet.2.1.bias
postnet.postnet.2.1._mean
postnet.postnet.2.1._variance
postnet.postnet.3.0.weight
postnet.postnet.3.1.weight
postnet.postnet.3.1.bias
postnet.postnet.3.1._mean
postnet.postnet.3.1._variance
postnet.postnet.4.0.weight
postnet.postnet.4.1.weight
postnet.postnet.4.1.bias
postnet.postnet.4.1._mean
postnet.postnet.4.1._variance
@@ -1,214 +0,0 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from pathlib import Path
from typing import List
from typing import Union

import yaml
from local.check_oov import get_check_result
from local.extract import extract_feature
from local.label_process import get_single_label
from local.prepare_env import generate_finetune_env
from local.train import train_sp
from paddle import distributed as dist
from yacs.config import CfgNode

from utils.gen_duration_from_textgrid import gen_duration_from_textgrid

DICT_EN = 'tools/aligner/cmudict-0.7b'
DICT_ZH = 'tools/aligner/simple.lexicon'
MODEL_DIR_EN = 'tools/aligner/vctk_model.zip'
MODEL_DIR_ZH = 'tools/aligner/aishell3_model.zip'
MFA_PHONE_EN = 'tools/aligner/vctk_model/meta.yaml'
MFA_PHONE_ZH = 'tools/aligner/aishell3_model/meta.yaml'
MFA_PATH = 'tools/montreal-forced-aligner/bin'
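# put the bundled MFA binaries (e.g. mfa_align) on PATH so os.system can find them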
os.environ['PATH'] = MFA_PATH + '/:' + os.environ['PATH']


class TrainArgs():
    def __init__(self,
                 ngpu,
                 config_file,
                 dump_dir: Path,
                 output_dir: Path,
                 frozen_layers: List[str]):
        # config: fastspeech2 config file.
        self.config = str(config_file)
        self.train_metadata = str(dump_dir / "train/norm/metadata.jsonl")
        self.dev_metadata = str(dump_dir / "dev/norm/metadata.jsonl")
        # model output dir.
        self.output_dir = str(output_dir)
        self.ngpu = ngpu
        self.phones_dict = str(dump_dir / "phone_id_map.txt")
        self.speaker_dict = str(dump_dir / "speaker_id_map.txt")
        self.voice_cloning = False
        # frozen layers
        self.frozen_layers = frozen_layers


def get_mfa_result(
        input_dir: Union[str, Path],
        mfa_dir: Union[str, Path],
        lang: str='en', ):
    """get mfa result

    Args:
        input_dir (Union[str, Path]): input dir including wav file and label
        mfa_dir (Union[str, Path]): mfa result dir
        lang (str, optional): input audio language. Defaults to 'en'.
    """
    # MFA
    if lang == 'en':
        DICT = DICT_EN
        MODEL_DIR = MODEL_DIR_EN
    elif lang == 'zh':
        DICT = DICT_ZH
        MODEL_DIR = MODEL_DIR_ZH
    else:
        raise ValueError(f"lang must be 'en' or 'zh', but got: {lang}")

    CMD = 'mfa_align' + ' ' + str(
        input_dir) + ' ' + DICT + ' ' + MODEL_DIR + ' ' + str(mfa_dir)
    os.system(CMD)


if __name__ == '__main__':
    # parse config and args
    parser = argparse.ArgumentParser(
        description="Preprocess audio and then extract features.")

    parser.add_argument(
        "--input_dir",
        type=str,
        default="./input/baker_mini",
        help="directory containing audio and label file")

    parser.add_argument(
        "--pretrained_model_dir",
        type=str,
        default="./pretrained_models/fastspeech2_aishell3_ckpt_1.1.0",
        help="Path to pretrained model")

    parser.add_argument(
        "--mfa_dir",
        type=str,
        default="./mfa_result",
        help="directory to save aligned files")

    parser.add_argument(
        "--dump_dir",
        type=str,
        default="./dump",
        help="directory to save feature files and metadata.")

    parser.add_argument(
        "--output_dir",
        type=str,
        default="./exp/default/",
        help="directory to save finetune model.")

    parser.add_argument(
        '--lang',
        type=str,
        default='zh',
        choices=['zh', 'en'],
        help='Choose input audio language. zh or en')

    parser.add_argument(
        "--ngpu", type=int, default=2, help="if ngpu=0, use cpu.")

    parser.add_argument("--epoch", type=int, default=100, help="finetune epoch")
    parser.add_argument(
        "--finetune_config",
        type=str,
        default="./finetune.yaml",
        help="Path to finetune config file")

    args = parser.parse_args()

    fs = 24000
    n_shift = 300
    input_dir = Path(args.input_dir).expanduser()
    mfa_dir = Path(args.mfa_dir).expanduser()
    mfa_dir.mkdir(parents=True, exist_ok=True)
    dump_dir = Path(args.dump_dir).expanduser()
    dump_dir.mkdir(parents=True, exist_ok=True)
    output_dir = Path(args.output_dir).expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)
    pretrained_model_dir = Path(args.pretrained_model_dir).expanduser()

    # read config
    config_file = pretrained_model_dir / "default.yaml"
    with open(config_file) as f:
        config = CfgNode(yaml.safe_load(f))
    config.max_epoch = config.max_epoch + args.epoch

    with open(args.finetune_config) as f2:
        finetune_config = CfgNode(yaml.safe_load(f2))
    config.batch_size = finetune_config.batch_size if finetune_config.batch_size > 0 else config.batch_size
    config.optimizer.learning_rate = finetune_config.learning_rate if finetune_config.learning_rate > 0 else config.optimizer.learning_rate
    config.num_snapshots = finetune_config.num_snapshots if finetune_config.num_snapshots > 0 else config.num_snapshots
    frozen_layers = finetune_config.frozen_layers
    assert isinstance(frozen_layers, list), "frozen_layers should be a list."

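    # A finetune.yaml sketch consistent with the overrides above (values are
    # illustrative, not from this change): a value <= 0 keeps the pretrained
    # default, and frozen_layers presumably names parameter prefixes such as
    # the checkpoint keys listed earlier in this diff, e.g.
    #   batch_size: -1
    #   learning_rate: 0.0001
    #   num_snapshots: -1
    #   frozen_layers: ["encoder", "duration_predictor"]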
    if args.lang == 'en':
        lexicon_file = DICT_EN
        mfa_phone_file = MFA_PHONE_EN
    elif args.lang == 'zh':
        lexicon_file = DICT_ZH
        mfa_phone_file = MFA_PHONE_ZH
    else:
        raise ValueError(f"lang must be 'en' or 'zh', but got: {args.lang}")

    print(f"finetune max_epoch: {config.max_epoch}")
    print(f"finetune batch_size: {config.batch_size}")
    print(f"finetune learning_rate: {config.optimizer.learning_rate}")
    print(f"finetune num_snapshots: {config.num_snapshots}")
    print(f"finetune frozen_layers: {frozen_layers}")

    am_phone_file = pretrained_model_dir / "phone_id_map.txt"
    label_file = input_dir / "labels.txt"

    # check phone for mfa and am finetune
    oov_words, oov_files, oov_file_words = get_check_result(
        label_file, lexicon_file, mfa_phone_file, am_phone_file)
    input_dir = get_single_label(label_file, oov_files, input_dir)

    # get mfa result
    get_mfa_result(input_dir, mfa_dir, args.lang)

    # generate durations.txt
    duration_file = "./durations.txt"
    gen_duration_from_textgrid(mfa_dir, duration_file, fs, n_shift)

    # generate phone and speaker map files
    extract_feature(duration_file, config, input_dir, dump_dir,
                    pretrained_model_dir)

    # create finetune env
    generate_finetune_env(output_dir, pretrained_model_dir)

    # create a new args for training
    train_args = TrainArgs(args.ngpu, config_file, dump_dir, output_dir,
                           frozen_layers)

    # finetune models
    # dispatch
    if args.ngpu > 1:
        dist.spawn(train_sp, (train_args, config), nprocs=args.ngpu)
    else:
        train_sp(train_args, config)
@@ -0,0 +1,38 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path

from utils.gen_duration_from_textgrid import gen_duration_from_textgrid

if __name__ == '__main__':
    # parse config and args
    parser = argparse.ArgumentParser(
        description="Generate durations.txt from MFA alignment results.")

    parser.add_argument(
        "--mfa_dir",
        type=str,
        default="./mfa_result",
        help="directory to save aligned files")

    args = parser.parse_args()

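    # frame parameters used to turn TextGrid intervals into frame counts;
    # these presumably match the pretrained model's feature extraction
    # (24 kHz sampling rate, 300-sample hop).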
    fs = 24000
    n_shift = 300
    duration_file = "./durations.txt"
    mfa_dir = Path(args.mfa_dir).expanduser()
    mfa_dir.mkdir(parents=True, exist_ok=True)

    gen_duration_from_textgrid(mfa_dir, duration_file, fs, n_shift)
@@ -0,0 +1,83 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from pathlib import Path
from typing import Union

DICT_EN = 'tools/aligner/cmudict-0.7b'
DICT_ZH = 'tools/aligner/simple.lexicon'
MODEL_DIR_EN = 'tools/aligner/vctk_model.zip'
MODEL_DIR_ZH = 'tools/aligner/aishell3_model.zip'
MFA_PHONE_EN = 'tools/aligner/vctk_model/meta.yaml'
MFA_PHONE_ZH = 'tools/aligner/aishell3_model/meta.yaml'
MFA_PATH = 'tools/montreal-forced-aligner/bin'
os.environ['PATH'] = MFA_PATH + '/:' + os.environ['PATH']


def get_mfa_result(
        input_dir: Union[str, Path],
        mfa_dir: Union[str, Path],
        lang: str='en', ):
    """get mfa result

    Args:
        input_dir (Union[str, Path]): input dir including wav file and label
        mfa_dir (Union[str, Path]): mfa result dir
        lang (str, optional): input audio language. Defaults to 'en'.
    """
    # MFA
    if lang == 'en':
        DICT = DICT_EN
        MODEL_DIR = MODEL_DIR_EN
    elif lang == 'zh':
        DICT = DICT_ZH
        MODEL_DIR = MODEL_DIR_ZH
    else:
        raise ValueError(f"lang must be 'en' or 'zh', but got: {lang}")

    CMD = 'mfa_align' + ' ' + str(
        input_dir) + ' ' + DICT + ' ' + MODEL_DIR + ' ' + str(mfa_dir)
    os.system(CMD)
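
# Example: with the defaults below and lang='zh', this shells out to
#   mfa_align ./input/baker_mini/newdir tools/aligner/simple.lexicon \
#             tools/aligner/aishell3_model.zip ./mfa_result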


if __name__ == '__main__':
    # parse config and args
    parser = argparse.ArgumentParser(
        description="Run Montreal Forced Aligner to get alignment results.")

    parser.add_argument(
        "--input_dir",
        type=str,
        default="./input/baker_mini/newdir",
        help="directory containing audio and label file")

    parser.add_argument(
        "--mfa_dir",
        type=str,
        default="./mfa_result",
        help="directory to save aligned files")

    parser.add_argument(
        '--lang',
        type=str,
        default='zh',
        choices=['zh', 'en'],
        help='Choose input audio language. zh or en')

    args = parser.parse_args()

    get_mfa_result(
        input_dir=args.input_dir, mfa_dir=args.mfa_dir, lang=args.lang)
@@ -1,63 +0,0 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from pathlib import Path
from typing import List
from typing import Union


def change_baker_label(baker_label_file: Union[str, Path],
                       out_label_file: Union[str, Path]):
    """change baker label file to regular label file

    Args:
        baker_label_file (Union[str, Path]): Original baker label file
        out_label_file (Union[str, Path]): regular label file
    """
    with open(baker_label_file) as f:
        lines = f.readlines()

    with open(out_label_file, "w") as fw:
        for i in range(0, len(lines), 2):
            utt_id = lines[i].split()[0]
            transcription = lines[i + 1].strip()
            fw.write(utt_id + "|" + transcription + "\n")
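
# Illustration (contents made up): a Baker-style label file keeps each
# utterance on two lines, an id line followed by a transcription line, e.g.
#   000001 <anything after the id is ignored>
#   ka2 er2 pu3 ...
# which change_baker_label() rewrites as a single line:
#   000001|ka2 er2 pu3 ...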


def get_single_label(label_file: Union[str, Path],
                     oov_files: List[Union[str, Path]],
                     input_dir: Union[str, Path]):
    """Split the label file into one text file per utterance

    Args:
        label_file (str or Path): label file, format: utt_id|phones id
        oov_files (List): utt_ids whose labels contain OOV words; these
            utterances are skipped
        input_dir (Path): input dir including audios
    """
    input_dir = Path(input_dir).expanduser()
    new_dir = input_dir / "newdir"
    new_dir.mkdir(parents=True, exist_ok=True)

    with open(label_file, "r") as f:
        for line in f.readlines():
            utt_id = line.split("|")[0]
            if utt_id not in oov_files:
                transcription = line.split("|")[1].strip()
                wav_file = str(input_dir) + "/" + utt_id + ".wav"
                new_wav_file = str(new_dir) + "/" + utt_id + ".wav"
                os.system("cp %s %s" % (wav_file, new_wav_file))
                single_file = str(new_dir) + "/" + utt_id + ".txt"
                with open(single_file, "w") as fw:
                    fw.write(transcription)

    return new_dir
@@ -0,0 +1,107 @@
#!/bin/bash

set -e
source path.sh

input_dir=./input/ljspeech_mini
newdir_name="newdir"
new_dir=${input_dir}/${newdir_name}
pretrained_model_dir=./pretrained_models/fastspeech2_vctk_ckpt_1.2.0
mfa_tools=./tools
mfa_dir=./mfa_result
dump_dir=./dump
output_dir=./exp/default
lang=en
ngpu=1
finetune_config=./conf/finetune.yaml

ckpt=snapshot_iter_66300
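# the iteration count in the snapshot name depends on data size, batch size
# and epochs; point this at the checkpoint finetuning actually writes under
# ${output_dir}/checkpoints/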

gpus=1
export CUDA_VISIBLE_DEVICES=${gpus}
stage=0
stop_stage=100


# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be mixed with positional arguments `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1

# check oov
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    echo "check oov"
    python3 local/check_oov.py \
        --input_dir=${input_dir} \
        --pretrained_model_dir=${pretrained_model_dir} \
        --newdir_name=${newdir_name} \
        --lang=${lang}
fi

# get mfa result
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    echo "get mfa result"
    python3 local/get_mfa_result.py \
        --input_dir=${new_dir} \
        --mfa_dir=${mfa_dir} \
        --lang=${lang}
fi

# generate durations.txt
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    echo "generate durations.txt"
    python3 local/generate_duration.py \
        --mfa_dir=${mfa_dir}
fi

# extract feature
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    echo "extract feature"
    python3 local/extract_feature.py \
        --duration_file="./durations.txt" \
        --input_dir=${new_dir} \
        --dump_dir=${dump_dir} \
        --pretrained_model_dir=${pretrained_model_dir}
fi

# create finetune env
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    echo "create finetune env"
    python3 local/prepare_env.py \
        --pretrained_model_dir=${pretrained_model_dir} \
        --output_dir=${output_dir}
fi

# finetune
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
    echo "finetune..."
    python3 local/finetune.py \
        --pretrained_model_dir=${pretrained_model_dir} \
        --dump_dir=${dump_dir} \
        --output_dir=${output_dir} \
        --ngpu=${ngpu} \
        --epoch=100 \
        --finetune_config=${finetune_config}
fi

# synthesize e2e
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
    echo "in hifigan syn_e2e"
    FLAGS_allocator_strategy=naive_best_fit \
    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
    python3 ${BIN_DIR}/../synthesize_e2e.py \
        --am=fastspeech2_vctk \
        --am_config=${pretrained_model_dir}/default.yaml \
        --am_ckpt=${output_dir}/checkpoints/${ckpt}.pdz \
        --am_stat=${pretrained_model_dir}/speech_stats.npy \
        --voc=hifigan_vctk \
        --voc_config=pretrained_models/hifigan_vctk_ckpt_0.2.0/default.yaml \
        --voc_ckpt=pretrained_models/hifigan_vctk_ckpt_0.2.0/snapshot_iter_2500000.pdz \
        --voc_stat=pretrained_models/hifigan_vctk_ckpt_0.2.0/feats_stats.npy \
        --lang=en \
        --text=${BIN_DIR}/../sentences_en.txt \
        --output_dir=./test_e2e/ \
        --phones_dict=${dump_dir}/phone_id_map.txt \
        --speaker_dict=${dump_dir}/speaker_id_map.txt \
        --spk_id=0
fi