commit
66782120b2
Before Width: | Height: | Size: 84 KiB |
@ -1,13 +1,8 @@
|
|||||||
aiofiles
|
aiofiles
|
||||||
faiss-cpu
|
faiss-cpu
|
||||||
fastapi
|
praatio==5.0.0
|
||||||
librosa
|
|
||||||
numpy
|
|
||||||
paddlenlp
|
|
||||||
paddlepaddle
|
|
||||||
paddlespeech
|
|
||||||
pydantic
|
pydantic
|
||||||
python-multipartscikit_learn
|
python-multipart
|
||||||
SoundFile
|
scikit_learn
|
||||||
starlette
|
starlette
|
||||||
uvicorn
|
uvicorn
|
||||||
|
@ -0,0 +1,195 @@
|
|||||||
|
import os
|
||||||
|
|
||||||
|
from .util import MAIN_ROOT
|
||||||
|
from .util import run_cmd
|
||||||
|
|
||||||
|
|
||||||
|
class SAT:
    """Build and run ERNIE-SAT ``synthesize_e2e.py`` commands.

    Every public entry point assembles a shell command from the model
    directories configured in ``__init__`` and passes it to ``run_cmd``
    together with the expected output path.
    """

    # Task names accepted by the mono-lingual entry points.
    _TASK_NAMES = ('synthesize', 'edit')

    def __init__(self):
        # ERNIE-SAT acoustic model directories, resolved against the current
        # working directory.
        self.zh_pretrain_model_path = os.path.realpath(
            "source/model/erniesat_aishell3_ckpt_1.2.0")
        self.en_pretrain_model_path = os.path.realpath(
            "source/model/erniesat_vctk_ckpt_1.2.0")
        self.cross_pretrain_model_path = os.path.realpath(
            "source/model/erniesat_aishell3_vctk_ckpt_1.2.0")

        # Vocoder directories.
        self.zh_voc_model_path = os.path.realpath(
            "source/model/hifigan_aishell3_ckpt_0.2.0")
        # NOTE(review): not referenced by any method of this class; the name
        # looks like a typo for "en_voc_model_path". Kept for compatibility.
        self.eb_voc_model_path = os.path.realpath(
            "source/model/hifigan_vctk_ckpt_0.2.0")
        self.cross_voc_model_path = os.path.realpath(
            "source/model/hifigan_aishell3_ckpt_0.2.0")

        # Directory that holds synthesize_e2e.py.
        self.BIN_DIR = os.path.join(MAIN_ROOT,
                                    "paddlespeech/t2s/exps/ernie_sat")

    @classmethod
    def _task_name_ok(cls, task_name):
        """Return True for a supported task name, else print a hint and return False."""
        if task_name in cls._TASK_NAMES:
            return True
        print("task name only in ['edit', 'synthesize']")
        return False

    def _run(self, pretrain_dir, voc_dir, erniesat_ckpt_name, task_name,
             input_name, old_str, new_str, output_name, source_lang,
             target_lang):
        """Derive the model file paths from the two directories and run the command."""
        cmd = self.get_cmd(
            task_name=task_name,
            input_name=input_name,
            old_str=old_str,
            new_str=new_str,
            # inference file configuration
            config_path=os.path.join(pretrain_dir, "default.yaml"),
            phones_dict=os.path.join(pretrain_dir, "phone_id_map.txt"),
            erniesat_ckpt=os.path.join(pretrain_dir, erniesat_ckpt_name),
            erniesat_stat=os.path.join(pretrain_dir, "speech_stats.npy"),
            voc="hifigan_aishell3",
            voc_config=os.path.join(voc_dir, "default.yaml"),
            voc_ckpt=os.path.join(voc_dir, "snapshot_iter_2500000.pdz"),
            voc_stat=os.path.join(voc_dir, "feats_stats.npy"),
            output_name=output_name,
            source_lang=source_lang,
            target_lang=target_lang)
        return run_cmd(cmd, output_name)

    def zh_synthesize_edit(self,
                           old_str: str,
                           new_str: str,
                           input_name: os.PathLike,
                           output_name: os.PathLike,
                           task_name: str="synthesize",
                           erniesat_ckpt_name: str="snapshot_iter_289500.pdz"):
        """Run a Chinese ("zh" -> "zh") synthesize or edit task.

        Args:
            old_str: Passed to the script as ``--old_str``.
            new_str: Passed to the script as ``--new_str``.
            input_name: Input wav path (``--wav_path``).
            output_name: Output wav path (``--output_name``).
            task_name: ``"synthesize"`` or ``"edit"``.
            erniesat_ckpt_name: Checkpoint file name inside the zh model directory.

        Returns:
            Whatever ``run_cmd`` returns, or ``None`` when ``task_name`` is
            not supported.
        """
        if not self._task_name_ok(task_name):
            return None
        return self._run(
            pretrain_dir=self.zh_pretrain_model_path,
            voc_dir=self.zh_voc_model_path,
            erniesat_ckpt_name=erniesat_ckpt_name,
            task_name=task_name,
            input_name=input_name,
            old_str=old_str,
            new_str=new_str,
            output_name=output_name,
            source_lang="zh",
            target_lang="zh")

    def crossclone(self,
                   old_str: str,
                   new_str: str,
                   input_name: os.PathLike,
                   output_name: os.PathLike,
                   source_lang: str,
                   target_lang: str,
                   erniesat_ckpt_name: str="snapshot_iter_489000.pdz"):
        """Run a cross-lingual "synthesize" task with the aishell3+vctk model.

        Args:
            old_str: Passed to the script as ``--old_str``.
            new_str: Passed to the script as ``--new_str``.
            input_name: Input wav path (``--wav_path``).
            output_name: Output wav path (``--output_name``).
            source_lang: Passed to the script as ``--source_lang``.
            target_lang: Passed to the script as ``--target_lang``.
            erniesat_ckpt_name: Checkpoint file name inside the cross model directory.

        Returns:
            Whatever ``run_cmd`` returns.
        """
        return self._run(
            pretrain_dir=self.cross_pretrain_model_path,
            voc_dir=self.cross_voc_model_path,
            erniesat_ckpt_name=erniesat_ckpt_name,
            task_name="synthesize",
            input_name=input_name,
            old_str=old_str,
            new_str=new_str,
            output_name=output_name,
            source_lang=source_lang,
            target_lang=target_lang)

    def en_synthesize_edit(self,
                           old_str: str,
                           new_str: str,
                           input_name: os.PathLike,
                           output_name: os.PathLike,
                           task_name: str="synthesize",
                           erniesat_ckpt_name: str="snapshot_iter_199500.pdz"):
        """Run an English ("en" -> "en") synthesize or edit task.

        Takes the same arguments as ``zh_synthesize_edit`` but uses the
        English ERNIE-SAT model directory.

        Returns:
            Whatever ``run_cmd`` returns, or ``None`` when ``task_name`` is
            not supported.
        """
        # Same validation as the Chinese entry point.
        if not self._task_name_ok(task_name):
            return None
        # NOTE(review): the English path vocodes with the aishell3 HiFiGAN
        # (zh_voc_model_path) while the vctk vocoder directory is unused;
        # behavior kept as-is -- confirm this is intended.
        return self._run(
            pretrain_dir=self.en_pretrain_model_path,
            voc_dir=self.zh_voc_model_path,
            erniesat_ckpt_name=erniesat_ckpt_name,
            task_name=task_name,
            input_name=input_name,
            old_str=old_str,
            new_str=new_str,
            output_name=output_name,
            source_lang="en",
            target_lang="en")

    def get_cmd(self,
                task_name: str,
                input_name: str,
                old_str: str,
                new_str: str,
                config_path: str,
                phones_dict: str,
                erniesat_ckpt: str,
                erniesat_stat: str,
                voc: str,
                voc_config: str,
                voc_ckpt: str,
                voc_stat: str,
                output_name: str,
                source_lang: str,
                target_lang: str):
        """Return the shell command line that runs ``synthesize_e2e.py``.

        Every value is shell-quoted, so text containing quotes, spaces or
        shell metacharacters reaches the script verbatim instead of being
        interpreted by the shell.

        Returns:
            A single-line command string: two ``FLAGS_*`` environment
            assignments, ``python3 <BIN_DIR>/synthesize_e2e.py`` and one
            ``--name=value`` option per argument.
        """
        import shlex

        options = (
            ("task_name", task_name),
            ("wav_path", input_name),
            ("old_str", old_str),
            ("new_str", new_str),
            ("source_lang", source_lang),
            ("target_lang", target_lang),
            ("erniesat_config", config_path),
            ("phones_dict", phones_dict),
            ("erniesat_ckpt", erniesat_ckpt),
            ("erniesat_stat", erniesat_stat),
            ("voc", voc),
            ("voc_config", voc_config),
            ("voc_ckpt", voc_ckpt),
            ("voc_stat", voc_stat),
            ("output_name", output_name),
        )
        script = shlex.quote(f"{self.BIN_DIR}/synthesize_e2e.py")
        rendered = " ".join(f"--{name}={shlex.quote(str(value))}"
                            for name, value in options)
        return ("FLAGS_allocator_strategy=naive_best_fit "
                "FLAGS_fraction_of_gpu_memory_to_use=0.01 "
                f"python3 {script} {rendered}")
|
@ -0,0 +1,125 @@
|
|||||||
|
import os
|
||||||
|
|
||||||
|
from .util import MAIN_ROOT
|
||||||
|
from .util import run_cmd
|
||||||
|
|
||||||
|
|
||||||
|
def find_max_ckpt(model_path):
    """Return the highest iteration number among the checkpoints in a directory.

    Only ``*.pdz`` files are considered, and the iteration is the decimal
    suffix after the last underscore (``snapshot_iter_96400.pdz`` -> 96400).
    Files whose suffix is not a decimal number are skipped rather than
    raising.

    Args:
        model_path: Directory to scan.

    Returns:
        The largest iteration found, or 0 when no checkpoint matches.
    """
    max_ckpt = 0
    for filename in os.listdir(model_path):
        stem, ext = os.path.splitext(filename)
        if ext != '.pdz':
            continue
        # Take the text after the last "_" so extra underscores in the
        # prefix (or none at all) no longer break tuple unpacking.
        iteration = stem.rpartition("_")[2]
        if iteration.isdecimal():
            max_ckpt = max(max_ckpt, int(iteration))
    return max_ckpt
|
||||||
|
|
||||||
|
|
||||||
|
class FineTune:
    """Drive the FastSpeech2 fine-tuning scripts and synthesize with the result.

    Both methods build a shell command and pass it to ``run_cmd``.
    """

    def __init__(self):
        self.now_file_path = os.path.dirname(__file__)
        # Root of the fine-tuning example; its local/*.py scripts are invoked below.
        self.PYTHONPATH = os.path.join(MAIN_ROOT,
                                       "examples/other/tts_finetune/tts3")
        self.BIN_DIR = os.path.join(MAIN_ROOT,
                                    "paddlespeech/t2s/exps/fastspeech2")
        # Model directories, resolved against the current working directory.
        self.pretrained_model_dir = os.path.realpath(
            "source/model/fastspeech2_aishell3_ckpt_1.1.0")
        self.voc_model_dir = os.path.realpath(
            "source/model/hifigan_aishell3_ckpt_0.2.0")
        self.finetune_config = "conf/tts3_finetune.yaml"

    def finetune(self, input_dir, exp_dir='temp', epoch=100):
        """Run the fine-tuning pipeline on the data in ``input_dir``.

        The command follows examples/other/tts_finetune/tts3/run.sh: check
        OOV, get the MFA result, generate durations, extract features,
        prepare the fine-tune environment, then fine-tune.

        Args:
            input_dir: Directory holding the input data.
            exp_dir: Working directory; ``mfa_result``, ``dump`` and ``exp``
                are placed beneath it.
            epoch: Value passed to ``finetune.py`` as ``--epoch``.

        Returns:
            Whatever ``run_cmd`` returns.
        """
        newdir_name = "newdir"
        new_dir = os.path.join(input_dir, newdir_name)
        mfa_dir = os.path.join(exp_dir, 'mfa_result')
        dump_dir = os.path.join(exp_dir, 'dump')
        output_dir = os.path.join(exp_dir, 'exp')
        lang = "zh"
        ngpu = 1

        # A backslash before a newline joins the lines inside the string
        # literal, so each python3 invocation ends up on one shell line.
        cmd = f"""
            # check oov
            python3 {self.PYTHONPATH}/local/check_oov.py \
                --input_dir={input_dir} \
                --pretrained_model_dir={self.pretrained_model_dir} \
                --newdir_name={newdir_name} \
                --lang={lang}

            # get mfa result
            python3 {self.PYTHONPATH}/local/get_mfa_result.py \
                --input_dir={new_dir} \
                --mfa_dir={mfa_dir} \
                --lang={lang}

            # generate durations.txt
            python3 {self.PYTHONPATH}/local/generate_duration.py \
                --mfa_dir={mfa_dir}

            # extract feature
            python3 {self.PYTHONPATH}/local/extract_feature.py \
                --duration_file="./durations.txt" \
                --input_dir={new_dir} \
                --dump_dir={dump_dir} \
                --pretrained_model_dir={self.pretrained_model_dir}

            # create finetune env
            python3 {self.PYTHONPATH}/local/prepare_env.py \
                --pretrained_model_dir={self.pretrained_model_dir} \
                --output_dir={output_dir}

            # finetune
            python3 {self.PYTHONPATH}/local/finetune.py \
                --pretrained_model_dir={self.pretrained_model_dir} \
                --dump_dir={dump_dir} \
                --output_dir={output_dir} \
                --ngpu={ngpu} \
                --epoch={epoch} \
                --finetune_config={self.finetune_config}
        """

        print(cmd)

        return run_cmd(cmd, exp_dir)

    def synthesize(self, text, wav_name, out_wav_dir, exp_dir='temp'):
        """Synthesize ``text`` with the newest fine-tuned checkpoint in ``exp_dir``.

        Writes ``<exp_dir>/sentences.txt`` containing ``"<wav_name> <text>"``
        and runs ``synthesize_e2e.py`` on it.

        Args:
            text: Sentence to synthesize.
            wav_name: Utterance id; the expected output is ``<wav_name>.wav``.
            out_wav_dir: Directory passed as ``--output_dir``.
            exp_dir: Working directory used by ``finetune``.

        Returns:
            Whatever ``run_cmd`` returns.
        """
        voc = "hifigan_aishell3"
        dump_dir = os.path.join(exp_dir, 'dump')
        output_dir = os.path.join(exp_dir, 'exp')
        text_path = os.path.join(exp_dir, 'sentences.txt')
        lang = "zh"

        # Pick the checkpoint with the highest iteration number.
        ckpt = find_max_ckpt(f"{output_dir}/checkpoints")

        # Write the sentence file read by the script.
        with open(text_path, "w", encoding='utf8') as f:
            f.write(wav_name + " " + text)

        cmd = f"""
            FLAGS_allocator_strategy=naive_best_fit \
            FLAGS_fraction_of_gpu_memory_to_use=0.01 \
            python3 {self.BIN_DIR}/../synthesize_e2e.py \
                --am=fastspeech2_aishell3 \
                --am_config={self.pretrained_model_dir}/default.yaml \
                --am_ckpt={output_dir}/checkpoints/snapshot_iter_{ckpt}.pdz \
                --am_stat={self.pretrained_model_dir}/speech_stats.npy \
                --voc={voc} \
                --voc_config={self.voc_model_dir}/default.yaml \
                --voc_ckpt={self.voc_model_dir}/snapshot_iter_2500000.pdz \
                --voc_stat={self.voc_model_dir}/feats_stats.npy \
                --lang={lang} \
                --text={text_path} \
                --output_dir={out_wav_dir} \
                --phones_dict={dump_dir}/phone_id_map.txt \
                --speaker_dict={dump_dir}/speaker_id_map.txt \
                --spk_id=0
        """

        out_path = os.path.join(out_wav_dir, f"{wav_name}.wav")

        return run_cmd(cmd, out_path)
|
@ -0,0 +1,57 @@
|
|||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
from .util import MAIN_ROOT
|
||||||
|
from .util import run_cmd
|
||||||
|
|
||||||
|
|
||||||
|
class VoiceCloneGE2E():
    """Voice cloning through ``voice_cloning.py`` with a GE2E parameter file."""

    def __init__(self):
        # Directory that holds voice_cloning.py.
        self.BIN_DIR = os.path.join(MAIN_ROOT, "paddlespeech/t2s/exps")
        # am
        self.am = "fastspeech2_aishell3"
        self.am_config = "source/model/fastspeech2_nosil_aishell3_vc1_ckpt_0.5/default.yaml"
        self.am_ckpt = "source/model/fastspeech2_nosil_aishell3_vc1_ckpt_0.5/snapshot_iter_96400.pdz"
        self.am_stat = "source/model/fastspeech2_nosil_aishell3_vc1_ckpt_0.5/speech_stats.npy"
        self.phones_dict = "source/model/fastspeech2_nosil_aishell3_vc1_ckpt_0.5/phone_id_map.txt"
        # voc
        self.voc = "pwgan_aishell3"
        self.voc_config = "source/model/pwg_aishell3_ckpt_0.5/default.yaml"
        self.voc_ckpt = "source/model/pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz"
        self.voc_stat = "source/model/pwg_aishell3_ckpt_0.5/feats_stats.npy"
        # ge2e
        self.ge2e_params_path = "source/model/ge2e_ckpt_0.3/step-3000000.pdparams"

    def vc(self, text, input_wav, out_wav):
        """Clone the voice in ``input_wav`` to speak ``text``.

        Args:
            text: Sentence to synthesize; shell-quoted before use.
            input_wav: Reference audio file. It is copied into a freshly
                emptied ``tmp_dir/ge2e`` directory.
            out_wav: Only its directory part is used, as ``--output-dir``.

        Returns:
            Whatever ``run_cmd`` returns. The expected output path given to
            ``run_cmd`` is ``<dirname(out_wav)>/<basename(input_wav)>``.
        """
        import shlex

        # The reference wav must sit alone in its own temporary directory.
        _, full_file_name = os.path.split(input_wav)
        ref_audio_dir = os.path.realpath("tmp_dir/ge2e")
        if os.path.isdir(ref_audio_dir):
            shutil.rmtree(ref_audio_dir)
        elif os.path.exists(ref_audio_dir):
            # A stray file at this path would make makedirs fail.
            os.remove(ref_audio_dir)
        # Always recreate the directory: copying into a missing directory
        # would instead create a plain file named like the directory.
        os.makedirs(ref_audio_dir, exist_ok=True)
        shutil.copy(input_wav, ref_audio_dir)

        output_dir = os.path.dirname(out_wav)

        cmd = f"""
            python3 {self.BIN_DIR}/voice_cloning.py \
                --am={self.am} \
                --am_config={self.am_config} \
                --am_ckpt={self.am_ckpt} \
                --am_stat={self.am_stat} \
                --voc={self.voc} \
                --voc_config={self.voc_config} \
                --voc_ckpt={self.voc_ckpt} \
                --voc_stat={self.voc_stat} \
                --ge2e_params_path={self.ge2e_params_path} \
                --text={shlex.quote(str(text))} \
                --input-dir={ref_audio_dir} \
                --output-dir={output_dir} \
                --phones-dict={self.phones_dict}
        """

        output_name = os.path.join(output_dir, full_file_name)
        return run_cmd(cmd, output_name=output_name)
|
@ -0,0 +1,54 @@
|
|||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
from .util import MAIN_ROOT
|
||||||
|
from .util import run_cmd
|
||||||
|
|
||||||
|
|
||||||
|
class VoiceCloneTDNN():
    """Voice cloning through ``voice_cloning.py`` with ``--use_ecapa=True``."""

    def __init__(self):
        # Directory that holds voice_cloning.py.
        self.BIN_DIR = os.path.join(MAIN_ROOT, "paddlespeech/t2s/exps")

        # am
        self.am = "fastspeech2_aishell3"
        self.am_config = "source/model/fastspeech2_aishell3_ckpt_vc2_1.2.0/default.yaml"
        self.am_ckpt = "source/model/fastspeech2_aishell3_ckpt_vc2_1.2.0/snapshot_iter_96400.pdz"
        self.am_stat = "source/model/fastspeech2_aishell3_ckpt_vc2_1.2.0/speech_stats.npy"
        self.phones_dict = "source/model/fastspeech2_aishell3_ckpt_vc2_1.2.0/phone_id_map.txt"
        # voc
        self.voc = "pwgan_aishell3"
        self.voc_config = "source/model/pwg_aishell3_ckpt_0.5/default.yaml"
        self.voc_ckpt = "source/model/pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz"
        self.voc_stat = "source/model/pwg_aishell3_ckpt_0.5/feats_stats.npy"

    def vc(self, text, input_wav, out_wav):
        """Clone the voice in ``input_wav`` to speak ``text``.

        Args:
            text: Sentence to synthesize; shell-quoted before use.
            input_wav: Reference audio file. It is copied into a freshly
                emptied ``tmp_dir/tdnn`` directory.
            out_wav: Only its directory part is used, as ``--output-dir``.

        Returns:
            Whatever ``run_cmd`` returns. The expected output path given to
            ``run_cmd`` is ``<dirname(out_wav)>/<basename(input_wav)>``.
        """
        import shlex

        # The reference wav must sit alone in its own temporary directory.
        _, full_file_name = os.path.split(input_wav)
        ref_audio_dir = os.path.realpath("tmp_dir/tdnn")
        if os.path.isdir(ref_audio_dir):
            shutil.rmtree(ref_audio_dir)
        elif os.path.exists(ref_audio_dir):
            # A stray file at this path would make makedirs fail.
            os.remove(ref_audio_dir)
        # Always recreate the directory: copying into a missing directory
        # would instead create a plain file named like the directory.
        os.makedirs(ref_audio_dir, exist_ok=True)
        shutil.copy(input_wav, ref_audio_dir)

        output_dir = os.path.dirname(out_wav)

        cmd = f"""
            python3 {self.BIN_DIR}/voice_cloning.py \
                --am={self.am} \
                --am_config={self.am_config} \
                --am_ckpt={self.am_ckpt} \
                --am_stat={self.am_stat} \
                --voc={self.voc} \
                --voc_config={self.voc_config} \
                --voc_ckpt={self.voc_ckpt} \
                --voc_stat={self.voc_stat} \
                --text={shlex.quote(str(text))} \
                --input-dir={ref_audio_dir} \
                --output-dir={output_dir} \
                --phones-dict={self.phones_dict} \
                --use_ecapa=True
        """

        output_name = os.path.join(output_dir, full_file_name)
        return run_cmd(cmd, output_name=output_name)
|
@ -0,0 +1,88 @@
|
|||||||
|
import axios from 'axios'
|
||||||
|
import {apiURL} from "./API.js"
|
||||||
|
|
||||||
|
// 上传音频-vc
|
||||||
|
/**
 * Upload audio (VC): POST `params` to `apiURL.VC_Upload`.
 *
 * @param {*} params Request body forwarded unchanged to axios.
 * @returns {Promise<*>} The axios response.
 */
export async function vcUpload(params) {
    return await axios.post(apiURL.VC_Upload, params);
}
|
||||||
|
|
||||||
|
// 上传音频-sat
|
||||||
|
/**
 * Upload audio (SAT): POST `params` to `apiURL.SAT_Upload`.
 *
 * @param {*} params Request body forwarded unchanged to axios.
 * @returns {Promise<*>} The axios response.
 */
export async function satUpload(params) {
    return await axios.post(apiURL.SAT_Upload, params);
}
|
||||||
|
|
||||||
|
// 上传音频-finetune
|
||||||
|
/**
 * Upload audio (fine-tune): POST `params` to `apiURL.FineTune_Upload`.
 *
 * @param {*} params Request body forwarded unchanged to axios.
 * @returns {Promise<*>} The axios response.
 */
export async function fineTuneUpload(params) {
    return await axios.post(apiURL.FineTune_Upload, params);
}
|
||||||
|
|
||||||
|
// 删除音频
|
||||||
|
/**
 * Delete audio: POST `params` to `apiURL.VC_Del`.
 *
 * @param {*} params Request body forwarded unchanged to axios.
 * @returns {Promise<*>} The axios response.
 */
export async function vcDel(params) {
    return await axios.post(apiURL.VC_Del, params);
}
|
||||||
|
|
||||||
|
// 获取音频列表vc
|
||||||
|
/**
 * Fetch the VC audio list: GET `apiURL.VC_List`.
 *
 * @returns {Promise<*>} The axios response.
 */
export async function vcList() {
    return await axios.get(apiURL.VC_List);
}
|
||||||
|
// 获取音频列表Sat
|
||||||
|
/**
 * Fetch the SAT audio list: GET `apiURL.SAT_List`.
 *
 * @returns {Promise<*>} The axios response.
 */
export async function satList() {
    return await axios.get(apiURL.SAT_List);
}
|
||||||
|
|
||||||
|
// 获取音频列表fineTune
|
||||||
|
/**
 * Fetch the fine-tune audio list: POST `params` to `apiURL.FineTune_List`.
 *
 * @param {*} params Request body forwarded unchanged to axios.
 * @returns {Promise<*>} The axios response.
 */
export async function fineTuneList(params) {
    return await axios.post(apiURL.FineTune_List, params);
}
|
||||||
|
|
||||||
|
// fineTune 一键重置 获取新的文件夹
|
||||||
|
/**
 * Fine-tune one-click reset, obtaining a new folder: GET `apiURL.FineTune_NewDir`.
 *
 * @returns {Promise<*>} The axios response.
 */
export async function fineTuneNewDir() {
    return await axios.get(apiURL.FineTune_NewDir);
}
|
||||||
|
|
||||||
|
// 获取音频数据
|
||||||
|
/**
 * Fetch audio data: POST `params` to `apiURL.VC_Download`.
 *
 * @param {*} params Request body forwarded unchanged to axios.
 * @returns {Promise<*>} The axios response.
 */
export async function vcDownload(params) {
    return await axios.post(apiURL.VC_Download, params);
}
|
||||||
|
|
||||||
|
// 获取音频数据Base64
|
||||||
|
/**
 * Fetch audio data as Base64: POST `params` to `apiURL.VC_Download_Base64`.
 *
 * @param {*} params Request body forwarded unchanged to axios.
 * @returns {Promise<*>} The axios response.
 */
export async function vcDownloadBase64(params) {
    return await axios.post(apiURL.VC_Download_Base64, params);
}
|
||||||
|
|
||||||
|
|
||||||
|
// 克隆合成G2P
|
||||||
|
/**
 * Clone synthesis (G2P): POST `params` to `apiURL.VC_CloneG2p`.
 *
 * @param {*} params Request body forwarded unchanged to axios.
 * @returns {Promise<*>} The axios response.
 */
export async function vcCloneG2P(params) {
    return await axios.post(apiURL.VC_CloneG2p, params);
}
|
||||||
|
|
||||||
|
// 克隆合成SAT
|
||||||
|
/**
 * Clone synthesis (SAT): POST `params` to `apiURL.VC_CloneSAT`.
 *
 * @param {*} params Request body forwarded unchanged to axios.
 * @returns {Promise<*>} The axios response.
 */
export async function vcCloneSAT(params) {
    return await axios.post(apiURL.VC_CloneSAT, params);
}
|
||||||
|
|
||||||
|
// 克隆合成 - finetune 微调
|
||||||
|
/**
 * Clone synthesis, fine-tuning step: POST `params` to `apiURL.VC_CloneFineTune`.
 *
 * @param {*} params Request body forwarded unchanged to axios.
 * @returns {Promise<*>} The axios response.
 */
export async function vcCloneFineTune(params) {
    return await axios.post(apiURL.VC_CloneFineTune, params);
}
|
||||||
|
|
||||||
|
// 克隆合成 - finetune 合成
|
||||||
|
/**
 * Clone synthesis, fine-tune synthesis step: POST `params` to
 * `apiURL.VC_CloneFineTuneSyn`.
 *
 * @param {*} params Request body forwarded unchanged to axios.
 * @returns {Promise<*>} The axios response.
 */
export async function vcCloneFineTuneSyn(params) {
    return await axios.post(apiURL.VC_CloneFineTuneSyn, params);
}
|
||||||
|
|
||||||
|
|
@ -1,7 +0,0 @@
|
|||||||
paddlespeech.cls.exps.panns.deploy.predict module
|
|
||||||
=================================================
|
|
||||||
|
|
||||||
.. automodule:: paddlespeech.cls.exps.panns.deploy.predict
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
@ -1,7 +0,0 @@
|
|||||||
paddlespeech.cls.exps.panns.export\_model module
|
|
||||||
================================================
|
|
||||||
|
|
||||||
.. automodule:: paddlespeech.cls.exps.panns.export_model
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
@ -1,7 +0,0 @@
|
|||||||
paddlespeech.cls.exps.panns.predict module
|
|
||||||
==========================================
|
|
||||||
|
|
||||||
.. automodule:: paddlespeech.cls.exps.panns.predict
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
@ -1,7 +0,0 @@
|
|||||||
paddlespeech.cls.exps.panns.train module
|
|
||||||
========================================
|
|
||||||
|
|
||||||
.. automodule:: paddlespeech.cls.exps.panns.train
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
@ -1,7 +0,0 @@
|
|||||||
paddlespeech.kws.exps.mdtc.plot\_det\_curve module
|
|
||||||
==================================================
|
|
||||||
|
|
||||||
.. automodule:: paddlespeech.kws.exps.mdtc.plot_det_curve
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
@ -1,7 +0,0 @@
|
|||||||
paddlespeech.s2t.decoders.ctcdecoder.scorer\_deprecated module
|
|
||||||
==============================================================
|
|
||||||
|
|
||||||
.. automodule:: paddlespeech.s2t.decoders.ctcdecoder.scorer_deprecated
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
@ -1,7 +0,0 @@
|
|||||||
paddlespeech.s2t.decoders.recog\_bin module
|
|
||||||
===========================================
|
|
||||||
|
|
||||||
.. automodule:: paddlespeech.s2t.decoders.recog_bin
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
@ -1,7 +0,0 @@
|
|||||||
paddlespeech.s2t.decoders.scorers.ngram module
|
|
||||||
==============================================
|
|
||||||
|
|
||||||
.. automodule:: paddlespeech.s2t.decoders.scorers.ngram
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
@ -1,7 +0,0 @@
|
|||||||
paddlespeech.s2t.exps.deepspeech2.bin.deploy.client module
|
|
||||||
==========================================================
|
|
||||||
|
|
||||||
.. automodule:: paddlespeech.s2t.exps.deepspeech2.bin.deploy.client
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
@ -1,7 +0,0 @@
|
|||||||
paddlespeech.s2t.exps.deepspeech2.bin.deploy.record module
|
|
||||||
==========================================================
|
|
||||||
|
|
||||||
.. automodule:: paddlespeech.s2t.exps.deepspeech2.bin.deploy.record
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
@ -1,7 +0,0 @@
|
|||||||
paddlespeech.s2t.exps.deepspeech2.bin.deploy.send module
|
|
||||||
========================================================
|
|
||||||
|
|
||||||
.. automodule:: paddlespeech.s2t.exps.deepspeech2.bin.deploy.send
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
@ -1,7 +0,0 @@
|
|||||||
paddlespeech.s2t.exps.u2.trainer module
|
|
||||||
=======================================
|
|
||||||
|
|
||||||
.. automodule:: paddlespeech.s2t.exps.u2.trainer
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
@ -1,7 +0,0 @@
|
|||||||
paddlespeech.s2t.exps.u2\_kaldi.bin.recog module
|
|
||||||
================================================
|
|
||||||
|
|
||||||
.. automodule:: paddlespeech.s2t.exps.u2_kaldi.bin.recog
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
@ -1,7 +0,0 @@
|
|||||||
paddlespeech.s2t.training.extensions.snapshot module
|
|
||||||
====================================================
|
|
||||||
|
|
||||||
.. automodule:: paddlespeech.s2t.training.extensions.snapshot
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
@ -1,7 +0,0 @@
|
|||||||
paddlespeech.s2t.training.extensions.visualizer module
|
|
||||||
======================================================
|
|
||||||
|
|
||||||
.. automodule:: paddlespeech.s2t.training.extensions.visualizer
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
@ -1,7 +0,0 @@
|
|||||||
paddlespeech.s2t.training.updaters.trainer module
|
|
||||||
=================================================
|
|
||||||
|
|
||||||
.. automodule:: paddlespeech.s2t.training.updaters.trainer
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
@ -1,7 +0,0 @@
|
|||||||
paddlespeech.s2t.transform.add\_deltas module
|
|
||||||
=============================================
|
|
||||||
|
|
||||||
.. automodule:: paddlespeech.s2t.transform.add_deltas
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
@ -1,7 +0,0 @@
|
|||||||
paddlespeech.s2t.transform.channel\_selector module
|
|
||||||
===================================================
|
|
||||||
|
|
||||||
.. automodule:: paddlespeech.s2t.transform.channel_selector
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
@ -1,7 +0,0 @@
|
|||||||
paddlespeech.s2t.transform.cmvn module
|
|
||||||
======================================
|
|
||||||
|
|
||||||
.. automodule:: paddlespeech.s2t.transform.cmvn
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
@ -1,7 +0,0 @@
|
|||||||
paddlespeech.s2t.transform.functional module
|
|
||||||
============================================
|
|
||||||
|
|
||||||
.. automodule:: paddlespeech.s2t.transform.functional
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
@ -1,7 +0,0 @@
|
|||||||
paddlespeech.s2t.transform.perturb module
|
|
||||||
=========================================
|
|
||||||
|
|
||||||
.. automodule:: paddlespeech.s2t.transform.perturb
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
@ -1,24 +0,0 @@
|
|||||||
paddlespeech.s2t.transform package
|
|
||||||
==================================
|
|
||||||
|
|
||||||
.. automodule:: paddlespeech.s2t.transform
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
||||||
|
|
||||||
Submodules
|
|
||||||
----------
|
|
||||||
|
|
||||||
.. toctree::
|
|
||||||
:maxdepth: 4
|
|
||||||
|
|
||||||
paddlespeech.s2t.transform.add_deltas
|
|
||||||
paddlespeech.s2t.transform.channel_selector
|
|
||||||
paddlespeech.s2t.transform.cmvn
|
|
||||||
paddlespeech.s2t.transform.functional
|
|
||||||
paddlespeech.s2t.transform.perturb
|
|
||||||
paddlespeech.s2t.transform.spec_augment
|
|
||||||
paddlespeech.s2t.transform.spectrogram
|
|
||||||
paddlespeech.s2t.transform.transform_interface
|
|
||||||
paddlespeech.s2t.transform.transformation
|
|
||||||
paddlespeech.s2t.transform.wpe
|
|
@ -1,7 +0,0 @@
|
|||||||
paddlespeech.s2t.transform.spec\_augment module
|
|
||||||
===============================================
|
|
||||||
|
|
||||||
.. automodule:: paddlespeech.s2t.transform.spec_augment
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
@ -1,7 +0,0 @@
|
|||||||
paddlespeech.s2t.transform.spectrogram module
|
|
||||||
=============================================
|
|
||||||
|
|
||||||
.. automodule:: paddlespeech.s2t.transform.spectrogram
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
@ -1,7 +0,0 @@
|
|||||||
paddlespeech.s2t.transform.transform\_interface module
|
|
||||||
======================================================
|
|
||||||
|
|
||||||
.. automodule:: paddlespeech.s2t.transform.transform_interface
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
@ -1,7 +0,0 @@
|
|||||||
paddlespeech.s2t.transform.transformation module
|
|
||||||
================================================
|
|
||||||
|
|
||||||
.. automodule:: paddlespeech.s2t.transform.transformation
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
@ -1,7 +0,0 @@
|
|||||||
paddlespeech.s2t.transform.wpe module
|
|
||||||
=====================================
|
|
||||||
|
|
||||||
.. automodule:: paddlespeech.s2t.transform.wpe
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
@ -1,7 +0,0 @@
|
|||||||
paddlespeech.server.engine.acs.python.acs\_engine module
|
|
||||||
========================================================
|
|
||||||
|
|
||||||
.. automodule:: paddlespeech.server.engine.acs.python.acs_engine
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
@ -1,7 +0,0 @@
|
|||||||
paddlespeech.server.utils.log module
|
|
||||||
====================================
|
|
||||||
|
|
||||||
.. automodule:: paddlespeech.server.utils.log
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
@ -1,7 +0,0 @@
|
|||||||
paddlespeech.t2s.exps.stream\_play\_tts module
|
|
||||||
==============================================
|
|
||||||
|
|
||||||
.. automodule:: paddlespeech.t2s.exps.stream_play_tts
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
@ -1,7 +0,0 @@
|
|||||||
paddlespeech.t2s.models.ernie\_sat.mlm module
|
|
||||||
=============================================
|
|
||||||
|
|
||||||
.. automodule:: paddlespeech.t2s.models.ernie_sat.mlm
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
@ -1,7 +0,0 @@
|
|||||||
paddlespeech.t2s.models.vits.monotonic\_align.core module
|
|
||||||
=========================================================
|
|
||||||
|
|
||||||
.. automodule:: paddlespeech.t2s.models.vits.monotonic_align.core
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
@ -1,16 +0,0 @@
|
|||||||
paddlespeech.t2s.models.vits.monotonic\_align package
|
|
||||||
=====================================================
|
|
||||||
|
|
||||||
.. automodule:: paddlespeech.t2s.models.vits.monotonic_align
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
||||||
|
|
||||||
Submodules
|
|
||||||
----------
|
|
||||||
|
|
||||||
.. toctree::
|
|
||||||
:maxdepth: 4
|
|
||||||
|
|
||||||
paddlespeech.t2s.models.vits.monotonic_align.core
|
|
||||||
paddlespeech.t2s.models.vits.monotonic_align.setup
|
|
@ -1,7 +0,0 @@
|
|||||||
paddlespeech.t2s.models.vits.monotonic\_align.setup module
|
|
||||||
==========================================================
|
|
||||||
|
|
||||||
.. automodule:: paddlespeech.t2s.models.vits.monotonic_align.setup
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,216 @@
|
|||||||
|
epoch
|
||||||
|
iteration
|
||||||
|
main_params
|
||||||
|
main_optimizer
|
||||||
|
spk_embedding_table.weight
|
||||||
|
encoder.embed.0.weight
|
||||||
|
encoder.embed.1.alpha
|
||||||
|
encoder.encoders.0.self_attn.linear_q.weight
|
||||||
|
encoder.encoders.0.self_attn.linear_q.bias
|
||||||
|
encoder.encoders.0.self_attn.linear_k.weight
|
||||||
|
encoder.encoders.0.self_attn.linear_k.bias
|
||||||
|
encoder.encoders.0.self_attn.linear_v.weight
|
||||||
|
encoder.encoders.0.self_attn.linear_v.bias
|
||||||
|
encoder.encoders.0.self_attn.linear_out.weight
|
||||||
|
encoder.encoders.0.self_attn.linear_out.bias
|
||||||
|
encoder.encoders.0.feed_forward.w_1.weight
|
||||||
|
encoder.encoders.0.feed_forward.w_1.bias
|
||||||
|
encoder.encoders.0.feed_forward.w_2.weight
|
||||||
|
encoder.encoders.0.feed_forward.w_2.bias
|
||||||
|
encoder.encoders.0.norm1.weight
|
||||||
|
encoder.encoders.0.norm1.bias
|
||||||
|
encoder.encoders.0.norm2.weight
|
||||||
|
encoder.encoders.0.norm2.bias
|
||||||
|
encoder.encoders.1.self_attn.linear_q.weight
|
||||||
|
encoder.encoders.1.self_attn.linear_q.bias
|
||||||
|
encoder.encoders.1.self_attn.linear_k.weight
|
||||||
|
encoder.encoders.1.self_attn.linear_k.bias
|
||||||
|
encoder.encoders.1.self_attn.linear_v.weight
|
||||||
|
encoder.encoders.1.self_attn.linear_v.bias
|
||||||
|
encoder.encoders.1.self_attn.linear_out.weight
|
||||||
|
encoder.encoders.1.self_attn.linear_out.bias
|
||||||
|
encoder.encoders.1.feed_forward.w_1.weight
|
||||||
|
encoder.encoders.1.feed_forward.w_1.bias
|
||||||
|
encoder.encoders.1.feed_forward.w_2.weight
|
||||||
|
encoder.encoders.1.feed_forward.w_2.bias
|
||||||
|
encoder.encoders.1.norm1.weight
|
||||||
|
encoder.encoders.1.norm1.bias
|
||||||
|
encoder.encoders.1.norm2.weight
|
||||||
|
encoder.encoders.1.norm2.bias
|
||||||
|
encoder.encoders.2.self_attn.linear_q.weight
|
||||||
|
encoder.encoders.2.self_attn.linear_q.bias
|
||||||
|
encoder.encoders.2.self_attn.linear_k.weight
|
||||||
|
encoder.encoders.2.self_attn.linear_k.bias
|
||||||
|
encoder.encoders.2.self_attn.linear_v.weight
|
||||||
|
encoder.encoders.2.self_attn.linear_v.bias
|
||||||
|
encoder.encoders.2.self_attn.linear_out.weight
|
||||||
|
encoder.encoders.2.self_attn.linear_out.bias
|
||||||
|
encoder.encoders.2.feed_forward.w_1.weight
|
||||||
|
encoder.encoders.2.feed_forward.w_1.bias
|
||||||
|
encoder.encoders.2.feed_forward.w_2.weight
|
||||||
|
encoder.encoders.2.feed_forward.w_2.bias
|
||||||
|
encoder.encoders.2.norm1.weight
|
||||||
|
encoder.encoders.2.norm1.bias
|
||||||
|
encoder.encoders.2.norm2.weight
|
||||||
|
encoder.encoders.2.norm2.bias
|
||||||
|
encoder.encoders.3.self_attn.linear_q.weight
|
||||||
|
encoder.encoders.3.self_attn.linear_q.bias
|
||||||
|
encoder.encoders.3.self_attn.linear_k.weight
|
||||||
|
encoder.encoders.3.self_attn.linear_k.bias
|
||||||
|
encoder.encoders.3.self_attn.linear_v.weight
|
||||||
|
encoder.encoders.3.self_attn.linear_v.bias
|
||||||
|
encoder.encoders.3.self_attn.linear_out.weight
|
||||||
|
encoder.encoders.3.self_attn.linear_out.bias
|
||||||
|
encoder.encoders.3.feed_forward.w_1.weight
|
||||||
|
encoder.encoders.3.feed_forward.w_1.bias
|
||||||
|
encoder.encoders.3.feed_forward.w_2.weight
|
||||||
|
encoder.encoders.3.feed_forward.w_2.bias
|
||||||
|
encoder.encoders.3.norm1.weight
|
||||||
|
encoder.encoders.3.norm1.bias
|
||||||
|
encoder.encoders.3.norm2.weight
|
||||||
|
encoder.encoders.3.norm2.bias
|
||||||
|
encoder.after_norm.weight
|
||||||
|
encoder.after_norm.bias
|
||||||
|
spk_projection.weight
|
||||||
|
spk_projection.bias
|
||||||
|
duration_predictor.conv.0.0.weight
|
||||||
|
duration_predictor.conv.0.0.bias
|
||||||
|
duration_predictor.conv.0.2.weight
|
||||||
|
duration_predictor.conv.0.2.bias
|
||||||
|
duration_predictor.conv.1.0.weight
|
||||||
|
duration_predictor.conv.1.0.bias
|
||||||
|
duration_predictor.conv.1.2.weight
|
||||||
|
duration_predictor.conv.1.2.bias
|
||||||
|
duration_predictor.linear.weight
|
||||||
|
duration_predictor.linear.bias
|
||||||
|
pitch_predictor.conv.0.0.weight
|
||||||
|
pitch_predictor.conv.0.0.bias
|
||||||
|
pitch_predictor.conv.0.2.weight
|
||||||
|
pitch_predictor.conv.0.2.bias
|
||||||
|
pitch_predictor.conv.1.0.weight
|
||||||
|
pitch_predictor.conv.1.0.bias
|
||||||
|
pitch_predictor.conv.1.2.weight
|
||||||
|
pitch_predictor.conv.1.2.bias
|
||||||
|
pitch_predictor.conv.2.0.weight
|
||||||
|
pitch_predictor.conv.2.0.bias
|
||||||
|
pitch_predictor.conv.2.2.weight
|
||||||
|
pitch_predictor.conv.2.2.bias
|
||||||
|
pitch_predictor.conv.3.0.weight
|
||||||
|
pitch_predictor.conv.3.0.bias
|
||||||
|
pitch_predictor.conv.3.2.weight
|
||||||
|
pitch_predictor.conv.3.2.bias
|
||||||
|
pitch_predictor.conv.4.0.weight
|
||||||
|
pitch_predictor.conv.4.0.bias
|
||||||
|
pitch_predictor.conv.4.2.weight
|
||||||
|
pitch_predictor.conv.4.2.bias
|
||||||
|
pitch_predictor.linear.weight
|
||||||
|
pitch_predictor.linear.bias
|
||||||
|
pitch_embed.0.weight
|
||||||
|
pitch_embed.0.bias
|
||||||
|
energy_predictor.conv.0.0.weight
|
||||||
|
energy_predictor.conv.0.0.bias
|
||||||
|
energy_predictor.conv.0.2.weight
|
||||||
|
energy_predictor.conv.0.2.bias
|
||||||
|
energy_predictor.conv.1.0.weight
|
||||||
|
energy_predictor.conv.1.0.bias
|
||||||
|
energy_predictor.conv.1.2.weight
|
||||||
|
energy_predictor.conv.1.2.bias
|
||||||
|
energy_predictor.linear.weight
|
||||||
|
energy_predictor.linear.bias
|
||||||
|
energy_embed.0.weight
|
||||||
|
energy_embed.0.bias
|
||||||
|
decoder.embed.0.alpha
|
||||||
|
decoder.encoders.0.self_attn.linear_q.weight
|
||||||
|
decoder.encoders.0.self_attn.linear_q.bias
|
||||||
|
decoder.encoders.0.self_attn.linear_k.weight
|
||||||
|
decoder.encoders.0.self_attn.linear_k.bias
|
||||||
|
decoder.encoders.0.self_attn.linear_v.weight
|
||||||
|
decoder.encoders.0.self_attn.linear_v.bias
|
||||||
|
decoder.encoders.0.self_attn.linear_out.weight
|
||||||
|
decoder.encoders.0.self_attn.linear_out.bias
|
||||||
|
decoder.encoders.0.feed_forward.w_1.weight
|
||||||
|
decoder.encoders.0.feed_forward.w_1.bias
|
||||||
|
decoder.encoders.0.feed_forward.w_2.weight
|
||||||
|
decoder.encoders.0.feed_forward.w_2.bias
|
||||||
|
decoder.encoders.0.norm1.weight
|
||||||
|
decoder.encoders.0.norm1.bias
|
||||||
|
decoder.encoders.0.norm2.weight
|
||||||
|
decoder.encoders.0.norm2.bias
|
||||||
|
decoder.encoders.1.self_attn.linear_q.weight
|
||||||
|
decoder.encoders.1.self_attn.linear_q.bias
|
||||||
|
decoder.encoders.1.self_attn.linear_k.weight
|
||||||
|
decoder.encoders.1.self_attn.linear_k.bias
|
||||||
|
decoder.encoders.1.self_attn.linear_v.weight
|
||||||
|
decoder.encoders.1.self_attn.linear_v.bias
|
||||||
|
decoder.encoders.1.self_attn.linear_out.weight
|
||||||
|
decoder.encoders.1.self_attn.linear_out.bias
|
||||||
|
decoder.encoders.1.feed_forward.w_1.weight
|
||||||
|
decoder.encoders.1.feed_forward.w_1.bias
|
||||||
|
decoder.encoders.1.feed_forward.w_2.weight
|
||||||
|
decoder.encoders.1.feed_forward.w_2.bias
|
||||||
|
decoder.encoders.1.norm1.weight
|
||||||
|
decoder.encoders.1.norm1.bias
|
||||||
|
decoder.encoders.1.norm2.weight
|
||||||
|
decoder.encoders.1.norm2.bias
|
||||||
|
decoder.encoders.2.self_attn.linear_q.weight
|
||||||
|
decoder.encoders.2.self_attn.linear_q.bias
|
||||||
|
decoder.encoders.2.self_attn.linear_k.weight
|
||||||
|
decoder.encoders.2.self_attn.linear_k.bias
|
||||||
|
decoder.encoders.2.self_attn.linear_v.weight
|
||||||
|
decoder.encoders.2.self_attn.linear_v.bias
|
||||||
|
decoder.encoders.2.self_attn.linear_out.weight
|
||||||
|
decoder.encoders.2.self_attn.linear_out.bias
|
||||||
|
decoder.encoders.2.feed_forward.w_1.weight
|
||||||
|
decoder.encoders.2.feed_forward.w_1.bias
|
||||||
|
decoder.encoders.2.feed_forward.w_2.weight
|
||||||
|
decoder.encoders.2.feed_forward.w_2.bias
|
||||||
|
decoder.encoders.2.norm1.weight
|
||||||
|
decoder.encoders.2.norm1.bias
|
||||||
|
decoder.encoders.2.norm2.weight
|
||||||
|
decoder.encoders.2.norm2.bias
|
||||||
|
decoder.encoders.3.self_attn.linear_q.weight
|
||||||
|
decoder.encoders.3.self_attn.linear_q.bias
|
||||||
|
decoder.encoders.3.self_attn.linear_k.weight
|
||||||
|
decoder.encoders.3.self_attn.linear_k.bias
|
||||||
|
decoder.encoders.3.self_attn.linear_v.weight
|
||||||
|
decoder.encoders.3.self_attn.linear_v.bias
|
||||||
|
decoder.encoders.3.self_attn.linear_out.weight
|
||||||
|
decoder.encoders.3.self_attn.linear_out.bias
|
||||||
|
decoder.encoders.3.feed_forward.w_1.weight
|
||||||
|
decoder.encoders.3.feed_forward.w_1.bias
|
||||||
|
decoder.encoders.3.feed_forward.w_2.weight
|
||||||
|
decoder.encoders.3.feed_forward.w_2.bias
|
||||||
|
decoder.encoders.3.norm1.weight
|
||||||
|
decoder.encoders.3.norm1.bias
|
||||||
|
decoder.encoders.3.norm2.weight
|
||||||
|
decoder.encoders.3.norm2.bias
|
||||||
|
decoder.after_norm.weight
|
||||||
|
decoder.after_norm.bias
|
||||||
|
feat_out.weight
|
||||||
|
feat_out.bias
|
||||||
|
postnet.postnet.0.0.weight
|
||||||
|
postnet.postnet.0.1.weight
|
||||||
|
postnet.postnet.0.1.bias
|
||||||
|
postnet.postnet.0.1._mean
|
||||||
|
postnet.postnet.0.1._variance
|
||||||
|
postnet.postnet.1.0.weight
|
||||||
|
postnet.postnet.1.1.weight
|
||||||
|
postnet.postnet.1.1.bias
|
||||||
|
postnet.postnet.1.1._mean
|
||||||
|
postnet.postnet.1.1._variance
|
||||||
|
postnet.postnet.2.0.weight
|
||||||
|
postnet.postnet.2.1.weight
|
||||||
|
postnet.postnet.2.1.bias
|
||||||
|
postnet.postnet.2.1._mean
|
||||||
|
postnet.postnet.2.1._variance
|
||||||
|
postnet.postnet.3.0.weight
|
||||||
|
postnet.postnet.3.1.weight
|
||||||
|
postnet.postnet.3.1.bias
|
||||||
|
postnet.postnet.3.1._mean
|
||||||
|
postnet.postnet.3.1._variance
|
||||||
|
postnet.postnet.4.0.weight
|
||||||
|
postnet.postnet.4.1.weight
|
||||||
|
postnet.postnet.4.1.bias
|
||||||
|
postnet.postnet.4.1._mean
|
||||||
|
postnet.postnet.4.1._variance
|
||||||
|
|
@ -0,0 +1,14 @@
|
|||||||
|
###########################################################
|
||||||
|
# PARAMETER SETTINGS #
|
||||||
|
###########################################################
|
||||||
|
# Set to -1 to indicate that the parameter is the same as the pretrained model configuration
|
||||||
|
|
||||||
|
batch_size: -1
|
||||||
|
learning_rate: 0.0001 # learning rate
|
||||||
|
num_snapshots: -1
|
||||||
|
|
||||||
|
# frozen_layers should be a list
|
||||||
|
# if you don't need to freeze, set frozen_layers to []
|
||||||
|
# fastspeech2 layer names are listed in conf/fastspeech2_layers.txt
|
||||||
|
# example: frozen_layers: ["encoder", "duration_predictor"]
|
||||||
|
frozen_layers: ["encoder"]
|
@ -1,214 +0,0 @@
|
|||||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
import argparse
|
|
||||||
import os
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import List
|
|
||||||
from typing import Union
|
|
||||||
|
|
||||||
import yaml
|
|
||||||
from local.check_oov import get_check_result
|
|
||||||
from local.extract import extract_feature
|
|
||||||
from local.label_process import get_single_label
|
|
||||||
from local.prepare_env import generate_finetune_env
|
|
||||||
from local.train import train_sp
|
|
||||||
from paddle import distributed as dist
|
|
||||||
from yacs.config import CfgNode
|
|
||||||
|
|
||||||
from utils.gen_duration_from_textgrid import gen_duration_from_textgrid
|
|
||||||
|
|
||||||
# Pronunciation dictionaries handed to `mfa_align`, one per language.
DICT_EN = 'tools/aligner/cmudict-0.7b'
DICT_ZH = 'tools/aligner/simple.lexicon'

# Aligner model archives handed to `mfa_align`, one per language.
MODEL_DIR_EN = 'tools/aligner/vctk_model.zip'
MODEL_DIR_ZH = 'tools/aligner/aishell3_model.zip'

# meta.yaml of each aligner model; passed to the OOV check as the MFA phone file.
MFA_PHONE_EN = 'tools/aligner/vctk_model/meta.yaml'
MFA_PHONE_ZH = 'tools/aligner/aishell3_model/meta.yaml'

# Directory that is prepended to PATH so that `mfa_align` can be resolved.
MFA_PATH = 'tools/montreal-forced-aligner/bin'
os.environ['PATH'] = f"{MFA_PATH}/:{os.environ['PATH']}"
|
|
||||||
|
|
||||||
|
|
||||||
class TrainArgs():
    """Plain attribute bundle passed to ``train_sp`` as its ``args`` object.

    All path-like values are stored as strings.

    Args:
        ngpu: number of GPUs to train on (stored unchanged).
        config_file: fastspeech2 config file; stored as ``str(config_file)``.
        dump_dir: directory holding the extracted features and the
            phone / speaker id maps. May be a ``str`` or a ``Path``.
        output_dir: directory the fine-tuned model is written to.
        frozen_layers: names of the layers to freeze (stored unchanged).
    """

    def __init__(self,
                 ngpu,
                 config_file,
                 dump_dir: Union[str, Path],
                 output_dir: Union[str, Path],
                 frozen_layers: List[str]):
        # Coerce once so that a plain string works with the `/` joins below.
        dump_dir = Path(dump_dir)

        # config: fastspeech2 config file.
        self.config = str(config_file)
        self.train_metadata = str(dump_dir / "train/norm/metadata.jsonl")
        self.dev_metadata = str(dump_dir / "dev/norm/metadata.jsonl")
        # model output dir.
        self.output_dir = str(output_dir)
        self.ngpu = ngpu
        self.phones_dict = str(dump_dir / "phone_id_map.txt")
        self.speaker_dict = str(dump_dir / "speaker_id_map.txt")
        self.voice_cloning = False
        # frozen layers
        self.frozen_layers = frozen_layers
|
|
||||||
|
|
||||||
|
|
||||||
def get_mfa_result(
        input_dir: Union[str, Path],
        mfa_dir: Union[str, Path],
        lang: str='en', ):
    """Run ``mfa_align`` on ``input_dir`` and write the result to ``mfa_dir``.

    Args:
        input_dir (Union[str, Path]): input dir including wav file and label
        mfa_dir (Union[str, Path]): mfa result dir
        lang (str, optional): input audio language, 'en' or 'zh'.
            Defaults to 'en'.

    Raises:
        ValueError: if ``lang`` is neither 'en' nor 'zh'.
        FileNotFoundError: if ``mfa_align`` cannot be found on PATH.
        subprocess.CalledProcessError: if ``mfa_align`` exits with a
            non-zero status.
    """
    # Local import: nothing else in this file needs subprocess.
    import subprocess

    if lang == 'en':
        dictionary, model = DICT_EN, MODEL_DIR_EN
    elif lang == 'zh':
        dictionary, model = DICT_ZH, MODEL_DIR_ZH
    else:
        # Fail here instead of crashing later on an unbound local.
        raise ValueError(
            f"please input right lang!! got {lang!r}, expected 'en' or 'zh'")

    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact; check=True surfaces a failed alignment.
    subprocess.run(
        ['mfa_align', str(input_dir), dictionary, model, str(mfa_dir)],
        check=True)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Command-line driver: check the labels for OOV words, run forced
    # alignment, generate durations, extract features, prepare the output
    # directory and finally fine-tune the pretrained model.
    parser = argparse.ArgumentParser(
        description="Preprocess audio and then extract features.")

    parser.add_argument(
        "--input_dir",
        type=str,
        default="./input/baker_mini",
        help="directory containing audio and label file")

    parser.add_argument(
        "--pretrained_model_dir",
        type=str,
        default="./pretrained_models/fastspeech2_aishell3_ckpt_1.1.0",
        help="Path to pretrained model")

    parser.add_argument(
        "--mfa_dir",
        type=str,
        default="./mfa_result",
        help="directory to save aligned files")

    parser.add_argument(
        "--dump_dir",
        type=str,
        default="./dump",
        help="directory to save feature files and metadata.")

    parser.add_argument(
        "--output_dir",
        type=str,
        default="./exp/default/",
        help="directory to save finetune model.")

    parser.add_argument(
        '--lang',
        type=str,
        default='zh',
        choices=['zh', 'en'],
        help='Choose input audio language. zh or en')

    parser.add_argument(
        "--ngpu", type=int, default=2, help="if ngpu=0, use cpu.")

    parser.add_argument("--epoch", type=int, default=100, help="finetune epoch")
    parser.add_argument(
        "--finetune_config",
        type=str,
        default="./finetune.yaml",
        help="Path to finetune config file")

    args = parser.parse_args()

    # Fixed values forwarded to gen_duration_from_textgrid below.
    fs = 24000
    n_shift = 300

    input_dir = Path(args.input_dir).expanduser()
    mfa_dir = Path(args.mfa_dir).expanduser()
    mfa_dir.mkdir(parents=True, exist_ok=True)
    dump_dir = Path(args.dump_dir).expanduser()
    dump_dir.mkdir(parents=True, exist_ok=True)
    output_dir = Path(args.output_dir).expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)
    pretrained_model_dir = Path(args.pretrained_model_dir).expanduser()

    # read config
    config_file = pretrained_model_dir / "default.yaml"
    with open(config_file) as f:
        config = CfgNode(yaml.safe_load(f))
    # The requested epochs are added on top of the pretrained max_epoch.
    config.max_epoch = config.max_epoch + args.epoch

    with open(args.finetune_config) as f2:
        finetune_config = CfgNode(yaml.safe_load(f2))

    # A non-positive value keeps the pretrained model's setting.
    if finetune_config.batch_size > 0:
        config.batch_size = finetune_config.batch_size
    if finetune_config.learning_rate > 0:
        config.optimizer.learning_rate = finetune_config.learning_rate
    if finetune_config.num_snapshots > 0:
        config.num_snapshots = finetune_config.num_snapshots

    frozen_layers = finetune_config.frozen_layers
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not isinstance(frozen_layers, list):
        raise TypeError("frozen_layers should be set a list.")

    if args.lang == 'en':
        lexicon_file = DICT_EN
        mfa_phone_file = MFA_PHONE_EN
    elif args.lang == 'zh':
        lexicon_file = DICT_ZH
        mfa_phone_file = MFA_PHONE_ZH
    else:
        # argparse `choices` already rejects other values; fail loudly
        # rather than continue with lexicon_file / mfa_phone_file unbound.
        raise ValueError('please input right lang!!')

    print(f"finetune max_epoch: {config.max_epoch}")
    print(f"finetune batch_size: {config.batch_size}")
    print(f"finetune learning_rate: {config.optimizer.learning_rate}")
    print(f"finetune num_snapshots: {config.num_snapshots}")
    print(f"finetune frozen_layers: {frozen_layers}")

    am_phone_file = pretrained_model_dir / "phone_id_map.txt"
    label_file = input_dir / "labels.txt"

    # check phone for mfa and am finetune
    oov_words, oov_files, oov_file_words = get_check_result(
        label_file, lexicon_file, mfa_phone_file, am_phone_file)
    # From here on input_dir is whatever get_single_label returns.
    input_dir = get_single_label(label_file, oov_files, input_dir)

    # get mfa result
    get_mfa_result(input_dir, mfa_dir, args.lang)

    # generate durations.txt
    duration_file = "./durations.txt"
    gen_duration_from_textgrid(mfa_dir, duration_file, fs, n_shift)

    # generate phone and speaker map files
    extract_feature(duration_file, config, input_dir, dump_dir,
                    pretrained_model_dir)

    # create finetune env
    generate_finetune_env(output_dir, pretrained_model_dir)

    # create a new args for training
    train_args = TrainArgs(args.ngpu, config_file, dump_dir, output_dir,
                           frozen_layers)

    # finetune models
    # dispatch: one process per GPU when more than one GPU is requested
    if args.ngpu > 1:
        dist.spawn(train_sp, (train_args, config), nprocs=args.ngpu)
    else:
        train_sp(train_args, config)
|
|
@ -0,0 +1,38 @@
|
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from utils.gen_duration_from_textgrid import gen_duration_from_textgrid
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Command-line driver: turn the alignment results in --mfa_dir into
    # ./durations.txt via gen_duration_from_textgrid.
    parser = argparse.ArgumentParser(
        description="Generate durations.txt from MFA alignment results.")

    parser.add_argument(
        "--mfa_dir",
        type=str,
        default="./mfa_result",
        help="directory containing the aligned files")

    args = parser.parse_args()

    # Fixed values forwarded to gen_duration_from_textgrid; presumably the
    # sampling rate and frame shift of the features (confirm against it).
    fs = 24000
    n_shift = 300
    duration_file = "./durations.txt"
    mfa_dir = Path(args.mfa_dir).expanduser()
    # Kept from the original: the directory is created if it is missing.
    mfa_dir.mkdir(parents=True, exist_ok=True)

    gen_duration_from_textgrid(mfa_dir, duration_file, fs, n_shift)
|
@ -0,0 +1,83 @@
|
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
# Pronunciation dictionaries handed to `mfa_align`, one per language.
DICT_EN = 'tools/aligner/cmudict-0.7b'
DICT_ZH = 'tools/aligner/simple.lexicon'

# Aligner model archives handed to `mfa_align`, one per language.
MODEL_DIR_EN = 'tools/aligner/vctk_model.zip'
MODEL_DIR_ZH = 'tools/aligner/aishell3_model.zip'

# meta.yaml of each aligner model.
MFA_PHONE_EN = 'tools/aligner/vctk_model/meta.yaml'
MFA_PHONE_ZH = 'tools/aligner/aishell3_model/meta.yaml'

# Directory that is prepended to PATH so that `mfa_align` can be resolved.
MFA_PATH = 'tools/montreal-forced-aligner/bin'
os.environ['PATH'] = f"{MFA_PATH}/:{os.environ['PATH']}"
|
||||||
|
|
||||||
|
|
||||||
|
def get_mfa_result(
        input_dir: Union[str, Path],
        mfa_dir: Union[str, Path],
        lang: str='en', ):
    """Run ``mfa_align`` on ``input_dir`` and write the result to ``mfa_dir``.

    Args:
        input_dir (Union[str, Path]): input dir including wav file and label
        mfa_dir (Union[str, Path]): mfa result dir
        lang (str, optional): input audio language, 'en' or 'zh'.
            Defaults to 'en'.

    Raises:
        ValueError: if ``lang`` is neither 'en' nor 'zh'.
        FileNotFoundError: if ``mfa_align`` cannot be found on PATH.
        subprocess.CalledProcessError: if ``mfa_align`` exits with a
            non-zero status.
    """
    # Local import: nothing else in this file needs subprocess.
    import subprocess

    if lang == 'en':
        dictionary, model = DICT_EN, MODEL_DIR_EN
    elif lang == 'zh':
        dictionary, model = DICT_ZH, MODEL_DIR_ZH
    else:
        # Fail here instead of crashing later on an unbound local.
        raise ValueError(
            f"please input right lang!! got {lang!r}, expected 'en' or 'zh'")

    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact; check=True surfaces a failed alignment.
    subprocess.run(
        ['mfa_align', str(input_dir), dictionary, model, str(mfa_dir)],
        check=True)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Command-line driver: parse the three options and run the aligner.
    parser = argparse.ArgumentParser(
        description="Run MFA forced alignment on audio and label files.")

    parser.add_argument(
        "--input_dir",
        type=str,
        default="./input/baker_mini/newdir",
        help="directory containing audio and label file")

    parser.add_argument(
        "--mfa_dir",
        type=str,
        default="./mfa_result",
        help="directory to save aligned files")

    parser.add_argument(
        '--lang',
        type=str,
        default='zh',
        choices=['zh', 'en'],
        help='Choose input audio language. zh or en')

    args = parser.parse_args()

    get_mfa_result(
        input_dir=args.input_dir, mfa_dir=args.mfa_dir, lang=args.lang)
|
@ -1,63 +0,0 @@
|
|||||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
import os
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import List
|
|
||||||
from typing import Union
|
|
||||||
|
|
||||||
|
|
||||||
def change_baker_label(baker_label_file: Union[str, Path],
                       out_label_file: Union[str, Path]):
    """change baker label file to regular label file

    The input is read as consecutive line pairs: the first whitespace
    separated token of the first line is the utterance id, and the whole
    second line (stripped) is the transcription. Each pair is written as
    ``utt_id|transcription``. Blank lines are ignored.

    Args:
        baker_label_file (Union[str, Path]): Original baker label file
        out_label_file (Union[str, Path]): regular label file

    Raises:
        ValueError: if the non-blank lines do not form complete pairs.
    """
    # Explicit UTF-8: the labels may contain non-ASCII text, and the
    # platform default encoding is not reliable for that.
    with open(baker_label_file, encoding="utf-8") as f:
        # Dropping blank lines keeps a trailing newline from breaking pairing.
        lines = [line for line in f if line.strip()]

    if len(lines) % 2:
        raise ValueError(
            f"{baker_label_file}: expected pairs of lines, "
            f"got {len(lines)} non-blank lines")

    with open(out_label_file, "w", encoding="utf-8") as fw:
        for id_line, text_line in zip(lines[::2], lines[1::2]):
            utt_id = id_line.split()[0]
            transcription = text_line.strip()
            fw.write(utt_id + "|" + transcription + "\n")
|
|
||||||
|
|
||||||
|
|
||||||
def get_single_label(label_file: Union[str, Path],
                     oov_files: List[Union[str, Path]],
                     input_dir: Union[str, Path]):
    """Divide the label file into individual files according to label_file

    For every utterance in ``label_file`` that is not listed in
    ``oov_files``, ``<utt_id>.wav`` is copied from ``input_dir`` into
    ``input_dir/newdir`` and the transcription is written next to it as
    ``<utt_id>.txt``. Blank lines in ``label_file`` are ignored.

    Args:
        label_file (str or Path): label file, format: utt_id|phones id
        oov_files (List[Union[str, Path]]): utterance ids to leave out
        input_dir (Path): input dir including audios

    Returns:
        Path: the ``newdir`` directory that was filled.

    Raises:
        FileNotFoundError: if the wav file of a kept utterance is missing.
    """
    # Local import: nothing else in this file needs shutil.
    import shutil

    input_dir = Path(input_dir).expanduser()
    new_dir = input_dir / "newdir"
    new_dir.mkdir(parents=True, exist_ok=True)

    # Build the lookup once instead of scanning the list for every line.
    skipped = set(oov_files)

    with open(label_file, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            fields = line.split("|")
            utt_id = fields[0]
            if utt_id in skipped:
                continue
            transcription = fields[1].strip()
            # shutil.copy replaces the shell `cp`: it copes with spaces in
            # paths and raises instead of failing silently.
            shutil.copy(input_dir / (utt_id + ".wav"),
                        new_dir / (utt_id + ".wav"))
            single_file = new_dir / (utt_id + ".txt")
            with open(single_file, "w", encoding="utf-8") as fw:
                fw.write(transcription)

    return new_dir
|
|
@ -0,0 +1,105 @@
|
|||||||
|
#!/bin/bash

# Staged fine-tuning recipe: OOV check, forced alignment, duration
# generation, feature extraction, environment setup, fine-tuning and
# end-to-end synthesis. Variable expansions are quoted so that paths
# containing spaces survive word splitting.

set -e
source path.sh

input_dir=./input/ljspeech_mini
newdir_name="newdir"
# NOTE(review): computed before the options are parsed, so overriding
# input_dir or newdir_name on the command line does not change new_dir.
new_dir=${input_dir}/${newdir_name}
pretrained_model_dir=./pretrained_models/fastspeech2_vctk_ckpt_1.2.0
mfa_tools=./tools
mfa_dir=./mfa_result
dump_dir=./dump
output_dir=./exp/default
lang=en
ngpu=1
finetune_config=./conf/finetune.yaml

ckpt=snapshot_iter_66300

gpus=1
# NOTE(review): this is assigned but never exported, so it only reaches the
# python processes if CUDA_VISIBLE_DEVICES was already exported by the
# caller. Left unchanged; confirm the intent before adding `export`.
CUDA_VISIBLE_DEVICES=${gpus}
stage=0
stop_stage=100


# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this can not be mixed use with `$1`, `$2` ...
source "${MAIN_ROOT}/utils/parse_options.sh" || exit 1

# check oov
if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    echo "check oov"
    python3 local/check_oov.py \
        --input_dir="${input_dir}" \
        --pretrained_model_dir="${pretrained_model_dir}" \
        --newdir_name="${newdir_name}" \
        --lang="${lang}"
fi

# get mfa result
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    echo "get mfa result"
    python3 local/get_mfa_result.py \
        --input_dir="${new_dir}" \
        --mfa_dir="${mfa_dir}" \
        --lang="${lang}"
fi

# generate durations.txt
if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    echo "generate durations.txt"
    python3 local/generate_duration.py \
        --mfa_dir="${mfa_dir}"
fi

# extract feature
if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
    echo "extract feature"
    python3 local/extract_feature.py \
        --duration_file="./durations.txt" \
        --input_dir="${new_dir}" \
        --dump_dir="${dump_dir}" \
        --pretrained_model_dir="${pretrained_model_dir}"
fi

# create finetune env
if [ "${stage}" -le 4 ] && [ "${stop_stage}" -ge 4 ]; then
    echo "create finetune env"
    python3 local/prepare_env.py \
        --pretrained_model_dir="${pretrained_model_dir}" \
        --output_dir="${output_dir}"
fi

# finetune
if [ "${stage}" -le 5 ] && [ "${stop_stage}" -ge 5 ]; then
    echo "finetune..."
    python3 local/finetune.py \
        --pretrained_model_dir="${pretrained_model_dir}" \
        --dump_dir="${dump_dir}" \
        --output_dir="${output_dir}" \
        --ngpu="${ngpu}" \
        --epoch=100 \
        --finetune_config="${finetune_config}"
fi

# synthesize e2e
if [ "${stage}" -le 6 ] && [ "${stop_stage}" -ge 6 ]; then
    echo "in hifigan syn_e2e"
    python3 "${BIN_DIR}/../synthesize_e2e.py" \
        --am=fastspeech2_vctk \
        --am_config="${pretrained_model_dir}/default.yaml" \
        --am_ckpt="${output_dir}/checkpoints/${ckpt}.pdz" \
        --am_stat="${pretrained_model_dir}/speech_stats.npy" \
        --voc=hifigan_vctk \
        --voc_config=pretrained_models/hifigan_vctk_ckpt_0.2.0/default.yaml \
        --voc_ckpt=pretrained_models/hifigan_vctk_ckpt_0.2.0/snapshot_iter_2500000.pdz \
        --voc_stat=pretrained_models/hifigan_vctk_ckpt_0.2.0/feats_stats.npy \
        --lang=en \
        --text="${BIN_DIR}/../sentences_en.txt" \
        --output_dir=./test_e2e/ \
        --phones_dict="${dump_dir}/phone_id_map.txt" \
        --speaker_dict="${dump_dir}/speaker_id_map.txt" \
        --spk_id=0
fi
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue