commit e91bff79f5
@@ -1,4 +1,8 @@
# directories
build/
output/
libs/
models/

# symbolic links
dict
@@ -0,0 +1 @@
src/TTSCppFrontend/build-depends.sh
@@ -0,0 +1,21 @@
# jieba conf
--jieba_dict_path=./dict/jieba/jieba.dict.utf8
--jieba_hmm_path=./dict/jieba/hmm_model.utf8
--jieba_user_dict_path=./dict/jieba/user.dict.utf8
--jieba_idf_path=./dict/jieba/idf.utf8
--jieba_stop_word_path=./dict/jieba/stop_words.utf8

# dict conf fastspeech2_0.4
--seperate_tone=false
--word2phone_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
--phone2id_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
--tone2id_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict

# dict conf speedyspeech_0.5
#--seperate_tone=true
#--word2phone_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/word2phone.dict
#--phone2id_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt
#--tone2id_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt

# dict of tranditional_to_simplified
--trand2simpd_path=./dict/tranditional_to_simplified/trand2simp.txt
@@ -0,0 +1 @@
../../TTSCppFrontend/
@@ -0,0 +1 @@
TTSCppFrontend/third-party
@@ -0,0 +1,2 @@
build/
dict/
@@ -0,0 +1,63 @@
cmake_minimum_required(VERSION 3.10)
project(paddlespeech_tts_cpp)


########## Global Options ##########

option(WITH_FRONT_DEMO "Build front demo" ON)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(ABSL_PROPAGATE_CXX_STD ON)


########## Dependencies ##########

set(ENV{PKG_CONFIG_PATH} "${CMAKE_SOURCE_DIR}/third-party/build/lib/pkgconfig:${CMAKE_SOURCE_DIR}/third-party/build/lib64/pkgconfig")
find_package(PkgConfig REQUIRED)

# It is hard to load xxx-config.cmake in a custom location, so use pkgconfig instead.
pkg_check_modules(ABSL REQUIRED absl_strings IMPORTED_TARGET)
pkg_check_modules(GFLAGS REQUIRED gflags IMPORTED_TARGET)
pkg_check_modules(GLOG REQUIRED libglog IMPORTED_TARGET)

# load header-only libraries
include_directories(
    ${CMAKE_SOURCE_DIR}/third-party/build/src/cppjieba/include
    ${CMAKE_SOURCE_DIR}/third-party/build/src/limonp/include
)

find_package(Threads REQUIRED)


########## paddlespeech_tts_front ##########

include_directories(src)

file(GLOB FRONT_SOURCES
    ./src/base/*.cpp
    ./src/front/*.cpp
)
add_library(paddlespeech_tts_front STATIC ${FRONT_SOURCES})

target_link_libraries(
    paddlespeech_tts_front
    PUBLIC
    PkgConfig::GFLAGS
    PkgConfig::GLOG
    PkgConfig::ABSL
    Threads::Threads
)


########## tts_front_demo ##########

if (WITH_FRONT_DEMO)

    file(GLOB FRONT_DEMO_SOURCES front_demo/*.cpp)
    add_executable(tts_front_demo ${FRONT_DEMO_SOURCES})

    target_include_directories(tts_front_demo PRIVATE ./front_demo)
    target_link_libraries(tts_front_demo PRIVATE paddlespeech_tts_front)

endif (WITH_FRONT_DEMO)
@@ -0,0 +1,56 @@
# PaddleSpeech TTS CPP Frontend

A TTS frontend that implements text-to-phoneme conversion.

Currently it only supports Chinese; any English word will crash the demo.

## Install Build Tools

```bash
# Ubuntu
sudo apt install build-essential cmake pkg-config

# CentOS
sudo yum groupinstall "Development Tools"
sudo yum install cmake
```

If your CMake version is too old, you can download a precompiled newer version from https://cmake.org/download/.

## Build

```bash
# Build with all CPU cores
./build.sh

# Build with 1 core
./build.sh -j1
```

Dependent libraries will be automatically downloaded to the `third-party/build` folder.

If the download speed is too slow, you can open [third-party/CMakeLists.txt](third-party/CMakeLists.txt) and modify the `GIT_REPOSITORY` URLs.

## Download dictionary files

```bash
./download.sh
```

## Run

You can change `--phone2id_path` in `./front_demo/front.conf` to the `phone_id_map.txt` of your own acoustic model.

```bash
./run_front_demo.sh
./run_front_demo.sh --help
./run_front_demo.sh --sentence "这是语音合成服务的文本前端,用于将文本转换为音素序号数组。"
./run_front_demo.sh --front_conf ./front_demo/front.conf --sentence "你还需要一个语音合成后端才能将其转换为实际的声音。"
```
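For reference, a `phone_id_map.txt` is a plain-text file with one `symbol id` pair per line; the `gen_id.py`-style generator in this commit writes `<pad> 0` and `<unk> 1` first and numbers the remaining phones from 2. A minimal Python sketch of loading such a map (the path below is just the demo default):

```python
# Minimal sketch: load a "<symbol> <id>"-per-line phone map, e.g.
# "<pad> 0", "<unk> 1", then one phone per line numbered from 2.
def load_phone_id_map(path):
    phone2id = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            symbol, idx = line.strip().split(" ")
            phone2id[symbol] = int(idx)
    return phone2id


phone2id = load_phone_id_map(
    "./front_demo/dict/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt")
```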

## Clean

```bash
./clean.sh
```

The folders `front_demo/dict`, `build` and `third-party/build` will be deleted.
@@ -0,0 +1,20 @@
#!/bin/bash
set -e
set -x

cd "$(dirname "$(realpath "$0")")"

cd ./third-party

mkdir -p build
cd build

cmake ..

if [ "$*" = "" ]; then
    make -j$(nproc)
else
    make "$@"
fi

echo "Done."
@@ -0,0 +1,21 @@
#!/bin/bash
set -e
set -x

cd "$(dirname "$(realpath "$0")")"

echo "************* Download & Build Dependencies *************"
./build-depends.sh "$@"

echo "************* Build Front Lib and Demo *************"
mkdir -p ./build
cd ./build
cmake ..

if [ "$*" = "" ]; then
    make -j$(nproc)
else
    make "$@"
fi

echo "Done."
@@ -0,0 +1,10 @@
#!/bin/bash
set -e
set -x

cd "$(dirname "$(realpath "$0")")"
rm -rf "./front_demo/dict"
rm -rf "./build"
rm -rf "./third-party/build"

echo "Done."
@@ -0,0 +1,63 @@
#!/bin/bash
set -e

cd "$(dirname "$(realpath "$0")")"

download() {
    file="$1"
    url="$2"
    md5="$3"
    dir="$4"

    cd "$dir"

    if [ -f "$file" ] && [ "$(md5sum "$file" | awk '{ print $1 }')" = "$md5" ]; then
        echo "File $file (MD5: $md5) has been downloaded."
    else
        echo "Downloading $file..."
        wget -O "$file" "$url"

        # MD5 verify
        fileMd5="$(md5sum "$file" | awk '{ print $1 }')"
        if [ "$fileMd5" == "$md5" ]; then
            echo "File $file (MD5: $md5) has been downloaded."
        else
            echo "MD5 mismatch, file may be corrupt"
            echo "$file MD5: $fileMd5, it should be $md5"
            exit 1  # do not extract a corrupt archive
        fi
    fi

    echo "Extracting $file..."
    echo '-----------------------'
    tar -vxf "$file"
    echo '======================='
}

########################################

DIST_DIR="$PWD/front_demo/dict"

mkdir -p "$DIST_DIR"

download 'fastspeech2_nosil_baker_ckpt_0.4.tar.gz' \
    'https://paddlespeech.bj.bcebos.com/t2s/text_frontend/fastspeech2_nosil_baker_ckpt_0.4.tar.gz' \
    '7bf1bab1737375fa123c413eb429c573' \
    "$DIST_DIR"

download 'speedyspeech_nosil_baker_ckpt_0.5.tar.gz' \
    'https://paddlespeech.bj.bcebos.com/t2s/text_frontend/speedyspeech_nosil_baker_ckpt_0.5.tar.gz' \
    '0b7754b21f324789aef469c61f4d5b8f' \
    "$DIST_DIR"

download 'jieba.tar.gz' \
    'https://paddlespeech.bj.bcebos.com/t2s/text_frontend/jieba.tar.gz' \
    '6d30f426bd8c0025110a483f051315ca' \
    "$DIST_DIR"

download 'tranditional_to_simplified.tar.gz' \
    'https://paddlespeech.bj.bcebos.com/t2s/text_frontend/tranditional_to_simplified.tar.gz' \
    '258f5b59d5ebfe96d02007ca1d274a7f' \
    "$DIST_DIR"

echo "Done."
@@ -0,0 +1,21 @@
# jieba conf
--jieba_dict_path=./front_demo/dict/jieba/jieba.dict.utf8
--jieba_hmm_path=./front_demo/dict/jieba/hmm_model.utf8
--jieba_user_dict_path=./front_demo/dict/jieba/user.dict.utf8
--jieba_idf_path=./front_demo/dict/jieba/idf.utf8
--jieba_stop_word_path=./front_demo/dict/jieba/stop_words.utf8

# dict conf fastspeech2_0.4
--seperate_tone=false
--word2phone_path=./front_demo/dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
--phone2id_path=./front_demo/dict/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
--tone2id_path=./front_demo/dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict

# dict conf speedyspeech_0.5
#--seperate_tone=true
#--word2phone_path=./front_demo/dict/speedyspeech_nosil_baker_ckpt_0.5/word2phone.dict
#--phone2id_path=./front_demo/dict/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt
#--tone2id_path=./front_demo/dict/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt

# dict of tranditional_to_simplified
--trand2simpd_path=./front_demo/dict/tranditional_to_simplified/trand2simp.txt
@@ -0,0 +1,80 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <gflags/gflags.h>
#include <glog/logging.h>
#include <map>
#include <string>
#include <vector>
#include "front/front_interface.h"

DEFINE_string(sentence, "你好,欢迎使用语音合成服务", "Text to be synthesized");
DEFINE_string(front_conf, "./front_demo/front.conf", "Front conf file");
// DEFINE_string(seperate_tone, "true", "If true, get phoneids and tonesid");


int main(int argc, char** argv) {
    gflags::ParseCommandLineFlags(&argc, &argv, true);
    // Instantiate the text frontend engine
    ppspeech::FrontEngineInterface* front_inst = nullptr;
    front_inst = new ppspeech::FrontEngineInterface(FLAGS_front_conf);
    if ((!front_inst) || (front_inst->init())) {
        LOG(ERROR) << "Create tts engine failed!";
        if (front_inst != nullptr) {
            delete front_inst;
        }
        front_inst = nullptr;
        return -1;
    }

    std::wstring ws_sentence = ppspeech::utf8string2wstring(FLAGS_sentence);

    // Convert traditional Chinese to simplified Chinese
    std::wstring sentence_simp;
    front_inst->Trand2Simp(ws_sentence, &sentence_simp);
    ws_sentence = sentence_simp;

    std::string s_sentence;
    std::vector<std::wstring> sentence_part;
    std::vector<int> phoneids = {};
    std::vector<int> toneids = {};

    // Split the sentence by punctuation
    LOG(INFO) << "Start to segment sentences by punctuation";
    front_inst->SplitByPunc(ws_sentence, &sentence_part);
    LOG(INFO) << "Segment sentences through punctuation successfully";

    // Get the phoneme IDs of each clause
    LOG(INFO)
        << "Start to get the phoneme and tone id sequence of each sentence";
    for (size_t i = 0; i < sentence_part.size(); i++) {
        LOG(INFO) << "Raw sentence is: "
                  << ppspeech::wstring2utf8string(sentence_part[i]);
        front_inst->SentenceNormalize(&sentence_part[i]);
        s_sentence = ppspeech::wstring2utf8string(sentence_part[i]);
        LOG(INFO) << "After normalization sentence is: " << s_sentence;

        if (0 != front_inst->GetSentenceIds(s_sentence, &phoneids, &toneids)) {
            LOG(ERROR) << "TTS inst get sentence phoneids and toneids failed";
            return -1;
        }
    }
    LOG(INFO) << "The phoneids of the sentence are: "
              << limonp::Join(phoneids.begin(), phoneids.end(), " ");
    LOG(INFO) << "The toneids of the sentence are: "
              << limonp::Join(toneids.begin(), toneids.end(), " ");
    LOG(INFO) << "Get the phoneme id sequence of each sentence successfully";

    return EXIT_SUCCESS;
}
@@ -0,0 +1,111 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import configparser

from paddlespeech.t2s.frontend.zh_frontend import Frontend


def get_phone(frontend,
              word,
              merge_sentences=True,
              print_info=False,
              robot=False,
              get_tone_ids=False):
    phonemes = frontend.get_phonemes(word, merge_sentences, print_info, robot)
    # Some optimizations
    phones, tones = frontend._get_phone_tone(phonemes[0], get_tone_ids)
    #print(type(phones), phones)
    #print(type(tones), tones)
    return phones, tones


def gen_word2phone_dict(frontend,
                        jieba_words_dict,
                        word2phone_dict,
                        get_tone=False):
    with open(jieba_words_dict, "r") as f1, open(word2phone_dict, "w+") as f2:
        for line in f1.readlines():
            word = line.split(" ")[0]
            phone, tone = get_phone(frontend, word, get_tone_ids=get_tone)
            phone_str = ""

            if tone:
                assert (len(phone) == len(tone))
                for i in range(len(tone)):
                    phone_tone = phone[i] + tone[i]
                    phone_str += (" " + phone_tone)
                phone_str = phone_str.strip("sp0").strip(" ")
            else:
                for x in phone:
                    phone_str += (" " + x)
                phone_str = phone_str.strip("sp").strip(" ")
            print(phone_str)
            f2.write(word + " " + phone_str + "\n")
    print("Generate word2phone dict successfully.")


def main():
    parser = argparse.ArgumentParser(description="Generate dictionary")
    parser.add_argument(
        "--config", type=str, default="./config.ini", help="config file.")
    parser.add_argument(
        "--am_type",
        type=str,
        default="fastspeech2",
        help="fastspeech2 or speedyspeech")
    args = parser.parse_args()

    # Read config
    cf = configparser.ConfigParser()
    cf.read(args.config)
    jieba_words_dict_file = cf.get("jieba",
                                   "jieba_words_dict")  # get words dict

    am_type = args.am_type
    if (am_type == "fastspeech2"):
        phone2id_dict_file = cf.get(am_type, "phone2id_dict")
        word2phone_dict_file = cf.get(am_type, "word2phone_dict")

        frontend = Frontend(phone_vocab_path=phone2id_dict_file)
        print("frontend done!")

        gen_word2phone_dict(
            frontend,
            jieba_words_dict_file,
            word2phone_dict_file,
            get_tone=False)

    elif (am_type == "speedyspeech"):
        phone2id_dict_file = cf.get(am_type, "phone2id_dict")
        tone2id_dict_file = cf.get(am_type, "tone2id_dict")
        word2phone_dict_file = cf.get(am_type, "word2phone_dict")

        frontend = Frontend(
            phone_vocab_path=phone2id_dict_file,
            tone_vocab_path=tone2id_dict_file)
        print("frontend done!")

        gen_word2phone_dict(
            frontend,
            jieba_words_dict_file,
            word2phone_dict_file,
            get_tone=True)

    else:
        print("Please set correct am type, fastspeech2 or speedyspeech.")


if __name__ == "__main__":
    main()
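The script above reads its dictionary paths from a `config.ini` via `configparser`; a minimal sketch of writing a config with exactly the sections and keys the script queries (all path values here are placeholders, not the shipped dictionaries):

```python
# Minimal sketch: emit a config.ini with the sections/keys that
# gen_dict_paddlespeech.py reads via configparser. Paths are placeholders.
import configparser

cf = configparser.ConfigParser()
cf["jieba"] = {"jieba_words_dict": "./dict/jieba/jieba.dict.utf8"}
cf["fastspeech2"] = {
    "phone2id_dict": "./dict/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt",
    "word2phone_dict": "./dict/word2phone_fs2.dict",
}
cf["speedyspeech"] = {
    "phone2id_dict": "./dict/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt",
    "tone2id_dict": "./dict/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt",
    "word2phone_dict": "./dict/word2phone.dict",
}
with open("./config.ini", "w") as f:
    cf.write(f)
```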
@@ -0,0 +1,36 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

PHONESFILE = "./dict/phones.txt"
PHONES_ID_FILE = "./dict/phonesid.dict"
TONESFILE = "./dict/tones.txt"
TONES_ID_FILE = "./dict/tonesid.dict"


def GenIdFile(file, idfile):
    # IDs 0 and 1 are reserved for <pad> and <unk>; real symbols start at 2.
    idx = 2
    with open(file, 'r') as f1, open(idfile, "w+") as f2:
        f2.write("<pad> 0\n")
        f2.write("<unk> 1\n")
        for line in f1.readlines():
            phone = line.strip()
            print(phone + " " + str(idx) + "\n")
            f2.write(phone + " " + str(idx) + "\n")
            idx += 1


if __name__ == "__main__":
    GenIdFile(PHONESFILE, PHONES_ID_FILE)
    GenIdFile(TONESFILE, TONES_ID_FILE)
@@ -0,0 +1,55 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re

from pypinyin import lazy_pinyin
from pypinyin import Style

worddict = "./dict/jieba_part.dict.utf8"
newdict = "./dict/word_phones.dict"


def GenPhones(initials, finals, separate=True):

    phones = []
    for c, v in zip(initials, finals):
        if re.match(r'i\d', v):
            if c in ['z', 'c', 's']:
                v = re.sub('i', 'ii', v)
            elif c in ['zh', 'ch', 'sh', 'r']:
                v = re.sub('i', 'iii', v)
        if c:
            if separate is True:
                phones.append(c + '0')
            elif separate is False:
                phones.append(c)
            else:
                print("Not sure whether phone and tone need to be separated")
        if v:
            phones.append(v)
    return phones


with open(worddict, "r") as f1, open(newdict, "w+") as f2:
    for line in f1.readlines():
        word = line.split(" ")[0]
        initials = lazy_pinyin(
            word, neutral_tone_with_five=True, style=Style.INITIALS)
        finals = lazy_pinyin(
            word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)

        phones = GenPhones(initials, finals, True)

        temp = " ".join(phones)
        f2.write(word + " " + temp + "\n")
@@ -0,0 +1,7 @@
#!/bin/bash
set -e
set -x

cd "$(dirname "$(realpath "$0")")"

./build/tts_front_demo "$@"
@@ -0,0 +1,28 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "base/type_conv.h"

namespace ppspeech {
// wstring to string
std::string wstring2utf8string(const std::wstring& str) {
    static std::wstring_convert<std::codecvt_utf8<wchar_t>> strCnv;
    return strCnv.to_bytes(str);
}

// string to wstring
std::wstring utf8string2wstring(const std::string& str) {
    static std::wstring_convert<std::codecvt_utf8<wchar_t>> strCnv;
    return strCnv.from_bytes(str);
}
}  // namespace ppspeech
@@ -0,0 +1,31 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef BASE_TYPE_CONV_H
#define BASE_TYPE_CONV_H

#include <codecvt>
#include <locale>
#include <string>


namespace ppspeech {
// wstring to string
std::string wstring2utf8string(const std::wstring& str);

// string to wstring
std::wstring utf8string2wstring(const std::string& str);
}  // namespace ppspeech

#endif  // BASE_TYPE_CONV_H
File diff suppressed because it is too large
@@ -0,0 +1,77 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PADDLE_TTS_SERVING_FRONT_TEXT_NORMALIZE_H
#define PADDLE_TTS_SERVING_FRONT_TEXT_NORMALIZE_H

#include <glog/logging.h>
#include <codecvt>
#include <map>
#include <regex>
#include <string>
#include "absl/strings/str_split.h"
#include "absl/strings/strip.h"
#include "base/type_conv.h"

namespace ppspeech {

class TextNormalizer {
  public:
    TextNormalizer() { InitMap(); }
    ~TextNormalizer() {}

    int InitMap();
    int Replace(std::wstring *sentence,
                const int &pos,
                const int &len,
                const std::wstring &repstr);
    int SplitByPunc(const std::wstring &sentence,
                    std::vector<std::wstring> *sentence_part);

    std::string CreateTextValue(const std::string &num, bool use_zero = true);
    std::string SingleDigit2Text(const std::string &num_str,
                                 bool alt_one = false);
    std::string SingleDigit2Text(const std::wstring &num, bool alt_one = false);
    std::string MultiDigit2Text(const std::string &num_str,
                                bool alt_one = false,
                                bool use_zero = true);
    std::string MultiDigit2Text(const std::wstring &num,
                                bool alt_one = false,
                                bool use_zero = true);
    std::string Digits2Text(const std::string &num_str);
    std::string Digits2Text(const std::wstring &num);

    int ReData(std::wstring *sentence);
    int ReData2(std::wstring *sentence);
    int ReTime(std::wstring *sentence);
    int ReTemperature(std::wstring *sentence);
    int ReFrac(std::wstring *sentence);
    int RePercentage(std::wstring *sentence);
    int ReMobilePhone(std::wstring *sentence);
    int RePhone(std::wstring *sentence);
    int ReRange(std::wstring *sentence);
    int ReInterger(std::wstring *sentence);
    int ReDecimalNum(std::wstring *sentence);
    int RePositiveQuantifiers(std::wstring *sentence);
    int ReDefalutNum(std::wstring *sentence);
    int ReNumber(std::wstring *sentence);
    int SentenceNormalize(std::wstring *sentence);


  private:
    std::map<std::string, std::string> digits_map;
    std::map<int, std::string> units_map;
};
}  // namespace ppspeech

#endif
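The implementations of these `Re*` passes live in a source file whose diff is suppressed above. As a rough illustration of the design — ordered regex rewrites that replace digit patterns with their spoken form — here is a hypothetical single pass in Python; the rule below is invented for illustration and is not taken from the real implementation:

```python
# Hypothetical sketch of one text-normalization pass (not the real rules):
# replace 11-digit mobile numbers with a digit-by-digit Chinese reading.
import re

DIGITS = "零一二三四五六七八九"


def digits_to_text(num_str):
    # Read each digit out one by one, e.g. "110" -> "一一零".
    return "".join(DIGITS[int(d)] for d in num_str)


def normalize_mobile_phone(sentence):
    return re.sub(r"\d{11}", lambda m: digits_to_text(m.group()), sentence)


print(normalize_mobile_phone("我的电话是13812345678"))
```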
@@ -0,0 +1,64 @@
cmake_minimum_required(VERSION 3.10)
project(tts_third_party_libs)

include(ExternalProject)

# gflags
ExternalProject_Add(gflags
    GIT_REPOSITORY https://github.com/gflags/gflags.git
    GIT_TAG v2.2.2
    PREFIX ${CMAKE_CURRENT_BINARY_DIR}
    INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}
    CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR>
               -DCMAKE_POSITION_INDEPENDENT_CODE=ON
               -DBUILD_STATIC_LIBS=OFF
               -DBUILD_SHARED_LIBS=ON
)

# glog
ExternalProject_Add(
    glog
    GIT_REPOSITORY https://github.com/google/glog.git
    GIT_TAG v0.6.0
    PREFIX ${CMAKE_CURRENT_BINARY_DIR}
    INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}
    CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR>
               -DCMAKE_POSITION_INDEPENDENT_CODE=ON
    DEPENDS gflags
)

# abseil
ExternalProject_Add(
    abseil
    GIT_REPOSITORY https://github.com/abseil/abseil-cpp.git
    GIT_TAG 20230125.1
    PREFIX ${CMAKE_CURRENT_BINARY_DIR}
    INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}
    CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR>
               -DCMAKE_POSITION_INDEPENDENT_CODE=ON
               -DABSL_PROPAGATE_CXX_STD=ON
)

# cppjieba (header-only)
ExternalProject_Add(
    cppjieba
    GIT_REPOSITORY https://github.com/yanyiwu/cppjieba.git
    GIT_TAG v5.0.3
    PREFIX ${CMAKE_CURRENT_BINARY_DIR}
    CONFIGURE_COMMAND ""
    BUILD_COMMAND ""
    INSTALL_COMMAND ""
    TEST_COMMAND ""
)

# limonp (header-only)
ExternalProject_Add(
    limonp
    GIT_REPOSITORY https://github.com/yanyiwu/limonp.git
    GIT_TAG v0.6.6
    PREFIX ${CMAKE_CURRENT_BINARY_DIR}
    CONFIGURE_COMMAND ""
    BUILD_COMMAND ""
    INSTALL_COMMAND ""
    TEST_COMMAND ""
)
(new binary file: image, 294 KiB)
@@ -0,0 +1,98 @@
############################################
#          Network Architecture            #
############################################
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: squeezeformer
encoder_conf:
    encoder_dim: 256    # dimension of attention
    output_size: 256    # dimension of output
    attention_heads: 4
    num_blocks: 12      # the number of encoder blocks
    reduce_idx: 5
    recover_idx: 11
    feed_forward_expansion_factor: 8
    input_dropout_rate: 0.1
    feed_forward_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    adaptive_scale: true
    cnn_module_kernel: 31
    normalize_before: false
    activation_type: 'swish'
    pos_enc_layer_type: 'rel_pos'
    time_reduction_layer_type: 'stream'
    causal: true
    use_dynamic_chunk: true
    use_dynamic_left_chunk: false

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1   # sublayer output dropout
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
    init_type: 'kaiming_uniform' # !Warning: needed for convergence

###########################################
#                   Data                  #
###########################################

train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test


###########################################
#                Dataloader               #
###########################################

vocab_filepath: data/lang_char/vocab.txt
spm_model_prefix: ''
unit_type: 'char'
preprocess_config: conf/preprocess.yaml
feat_dim: 80
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
batch_size: 32
maxlen_in: 512  # if input length > maxlen-in, batchsize is automatically reduced
maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
num_workers: 2
subsampling_factor: 1
num_encs: 1

###########################################
#                 Training                #
###########################################
n_epoch: 240
accum_grad: 1
global_grad_clip: 5.0
dist_sampler: True
optim: adam
optim_conf:
    lr: 0.001
    weight_decay: 1.0e-6
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 100
checkpoint:
    kbest_n: 50
    latest_n: 5
@@ -0,0 +1,93 @@
############################################
#          Network Architecture            #
############################################
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: squeezeformer
encoder_conf:
    encoder_dim: 256    # dimension of attention
    output_size: 256    # dimension of output
    attention_heads: 4
    num_blocks: 12      # the number of encoder blocks
    reduce_idx: 5
    recover_idx: 11
    feed_forward_expansion_factor: 8
    input_dropout_rate: 0.1
    feed_forward_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    adaptive_scale: true
    cnn_module_kernel: 31
    normalize_before: false
    activation_type: 'swish'
    pos_enc_layer_type: 'rel_pos'
    time_reduction_layer_type: 'conv1d'

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
    init_type: 'kaiming_uniform' # !Warning: needed for convergence

###########################################
#                   Data                  #
###########################################
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test

###########################################
#                Dataloader               #
###########################################
vocab_filepath: data/lang_char/vocab.txt
spm_model_prefix: ''
unit_type: 'char'
preprocess_config: conf/preprocess.yaml
feat_dim: 80
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
batch_size: 32
maxlen_in: 512  # if input length > maxlen-in, batchsize is automatically reduced
maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
num_workers: 2
subsampling_factor: 1
num_encs: 1

###########################################
#                 Training                #
###########################################
n_epoch: 150
accum_grad: 8
global_grad_clip: 5.0
dist_sampler: False
optim: adam
optim_conf:
    lr: 0.002
    weight_decay: 1.0e-6
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 100
checkpoint:
    kbest_n: 50
    latest_n: 5
@@ -0,0 +1,18 @@
# AISHELL

## Version

* paddle version: develop (commit id: daea892c67e85da91906864de40ce9f6f1b893ae)
* paddlespeech version: develop (commit id: c14b4238b256693281e59605abff7c9435b3e2b2)
* paddlenlp version: 2.5.2

## Device
* python: 3.7
* cuda: 10.2
* cudnn: 7.6

## Result
train: Epoch 80, 2*V100-32G, batch size: 5
| Model | Params | Config | Augmentation| Test set | Decode method | WER |
| --- | --- | --- | --- | --- | --- | --- |
| wav2vec2ASR | 324.49 M | conf/wav2vec2ASR.yaml | spec_aug | test-set | greedy search | 5.1009 |
@@ -0,0 +1,168 @@
############################################
#          Network Architecture            #
############################################
freeze_wav2vec2: False
normalize_wav: True
output_norm: True
init_type: 'kaiming_uniform' # !Warning: needed for convergence
enc:
    input_shape: 1024
    dnn_blocks: 3
    dnn_neurons: 1024
    activation: True
    normalization: True
    dropout_rate: [0.15, 0.15, 0.0]
ctc:
    enc_n_units: 1024
    blank_id: 0
    dropout_rate: 0.0

audio_augment:
    speeds: [90, 100, 110]

spec_augment:
    time_warp: True
    time_warp_window: 5
    time_warp_mode: bicubic
    freq_mask: True
    n_freq_mask: 2
    time_mask: True
    n_time_mask: 2
    replace_with_zero: False
    freq_mask_width: 30
    time_mask_width: 40
wav2vec2_params_path: exp/wav2vec2/chinese-wav2vec2-large.pdparams


############################################
#                Wav2Vec2.0                #
############################################
# vocab_size: 1000000
hidden_size: 1024
num_hidden_layers: 24
num_attention_heads: 16
intermediate_size: 4096
hidden_act: gelu
hidden_dropout: 0.1
activation_dropout: 0.0
attention_dropout: 0.1
feat_proj_dropout: 0.1
feat_quantizer_dropout: 0.0
final_dropout: 0.0
layerdrop: 0.1
initializer_range: 0.02
layer_norm_eps: 1e-5
feat_extract_norm: layer
feat_extract_activation: gelu
conv_dim: [512, 512, 512, 512, 512, 512, 512]
conv_stride: [5, 2, 2, 2, 2, 2, 2]
conv_kernel: [10, 3, 3, 3, 3, 2, 2]
conv_bias: True
num_conv_pos_embeddings: 128
num_conv_pos_embedding_groups: 16
do_stable_layer_norm: True
apply_spec_augment: False
mask_channel_length: 10
mask_channel_min_space: 1
mask_channel_other: 0.0
mask_channel_prob: 0.0
mask_channel_selection: static
mask_feature_length: 10
mask_feature_min_masks: 0
mask_feature_prob: 0.0
mask_time_length: 10
mask_time_min_masks: 2
mask_time_min_space: 1
mask_time_other: 0.0
mask_time_prob: 0.075
mask_time_selection: static
num_codevectors_per_group: 320
num_codevector_groups: 2
contrastive_logits_temperature: 0.1
num_negatives: 100
codevector_dim: 256
proj_codevector_dim: 256
diversity_loss_weight: 0.1
use_weighted_layer_sum: False
# pad_token_id: 0
# bos_token_id: 1
# eos_token_id: 2
add_adapter: False
adapter_kernel_size: 3
adapter_stride: 2
num_adapter_layers: 3
output_hidden_size: None

###########################################
#                   Data                  #
###########################################

train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test
vocab_filepath: data/lang_char/vocab.txt

###########################################
#                Dataloader               #
###########################################

unit_type: 'char'
tokenizer: bert-base-chinese
mean_std_filepath:
preprocess_config: conf/preprocess.yaml
sortagrad: -1 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
batch_size: 5  # Different batch_size may cause large differences in results
maxlen_in: 51200000000  # if input length > maxlen-in batchsize is automatically reduced
maxlen_out: 1500000  # if output length > maxlen-out batchsize is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
num_workers: 6
subsampling_factor: 1
num_encs: 1
dist_sampler: True
shortest_first: True
return_lens_rate: True

###########################################
#        use speechbrain dataloader       #
###########################################
use_sb_pipeline: True  # whether to use the speechbrain pipeline. Default is True.
sb_pipeline_conf: conf/train_with_wav2vec.yaml

###########################################
#                 Training                #
###########################################
n_epoch: 80
accum_grad: 1
global_grad_clip: 5.0

model_optim: adadelta
model_optim_conf:
    lr: 1.0
    weight_decay: 0.0
    rho: 0.95
    epsilon: 1.0e-8

wav2vec2_optim: adam
wav2vec2_optim_conf:
    lr: 0.0001
    weight_decay: 0.0

model_scheduler: newbobscheduler
model_scheduler_conf:
    improvement_threshold: 0.0025
    annealing_factor: 0.8
    patient: 0
wav2vec2_scheduler: newbobscheduler
wav2vec2_scheduler_conf:
    improvement_threshold: 0.0025
    annealing_factor: 0.9
    patient: 0
log_interval: 1
checkpoint:
    kbest_n: 50
    latest_n: 5
@@ -0,0 +1 @@
../../tts3/local/paddle2onnx.sh
@@ -0,0 +1,6 @@

# Opencpop

* svs1 - DiffSinger
* voc1 - Parallel WaveGAN
* voc5 - HiFiGAN
@@ -0,0 +1,276 @@
([简体中文](./README_cn.md)|English)
# DiffSinger with Opencpop
This example contains code used to train a [DiffSinger](https://arxiv.org/abs/2105.02446) model with the [Mandarin singing corpus](https://wenet.org.cn/opencpop/).

## Dataset
### Download and Extract
Download Opencpop from its [Official Website](https://wenet.org.cn/opencpop/download/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/Opencpop`.

## Get Started
Assume the path to the dataset is `~/datasets/Opencpop`.
Run the command below to
1. **source path**.
2. preprocess the dataset.
3. train the model.
4. synthesize wavs.
    - synthesize waveform from `metadata.jsonl`.
    - (Supporting) synthesize waveform from a text file.
5. (Supporting) inference using the static model.
```bash
./run.sh
```
You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage. For example, running the following command will only preprocess the dataset.
```bash
./run.sh --stage 0 --stop-stage 0
```
### Data Preprocessing
```bash
./local/preprocess.sh ${conf_path}
```
When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.

```text
dump
├── dev
│   ├── norm
│   └── raw
├── phone_id_map.txt
├── speaker_id_map.txt
├── test
│   ├── norm
│   └── raw
└── train
    ├── energy_stats.npy
    ├── norm
    ├── pitch_stats.npy
    ├── raw
    ├── speech_stats.npy
    └── speech_stretchs.npy

```
The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The raw folder contains speech, pitch and energy features of each utterance, while the norm folder contains normalized ones. The statistics used to normalize features are computed from the training set, which is located in `dump/train/*_stats.npy`. `speech_stretchs.npy` contains the minimum and maximum values of each dimension of the mel spectrum, which are used for linear stretching before training/inference of the diffusion module.
Note: since training on non-normalized features works better than on normalized ones for this model, the features saved under `norm` are in fact not normalized.

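The linear stretching mentioned above can be sketched as follows; this assumes — as an illustration, not a statement about the actual array layout — that `speech_stretchs.npy` stores the per-dimension minimum and maximum of the mel spectrum as two stacked rows:

```python
import numpy as np

# Minimal sketch of min-max linear stretching of the mel spectrum before the
# diffusion module. Assumption: speech_stretchs.npy holds [min_row, max_row].
stretchs = np.load("dump/train/speech_stretchs.npy")
mel_min, mel_max = stretchs[0], stretchs[1]


def stretch(mel):
    # Map each mel dimension linearly into [-1, 1].
    return (mel - mel_min) / (mel_max - mel_min) * 2.0 - 1.0
```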
Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains utterance id, speaker id, phones, text_lengths, speech_lengths, phone durations, the path of speech features, the path of pitch features, the path of energy features, note, note durations, slur.

### Model Training
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
`./local/train.sh` calls `${BIN_DIR}/train.py`.
Here's the complete help message.
```text
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
                [--ngpu NGPU] [--phones-dict PHONES_DICT]
                [--speaker-dict SPEAKER_DICT] [--speech-stretchs SPEECH_STRETCHS]

Train a FastSpeech2 model.

optional arguments:
  -h, --help            show this help message and exit
  --config CONFIG       diffsinger config file.
  --train-metadata TRAIN_METADATA
                        training data.
  --dev-metadata DEV_METADATA
                        dev data.
  --output-dir OUTPUT_DIR
                        output dir.
  --ngpu NGPU           if ngpu=0, use cpu.
  --phones-dict PHONES_DICT
                        phone vocabulary file.
  --speaker-dict SPEAKER_DICT
                        speaker id map file for multiple speaker model.
  --speech-stretchs SPEECH_STRETCHS
                        min and max mel for stretching.
```
1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory.
4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
5. `--phones-dict` is the path of the phone vocabulary file.
6. `--speech-stretchs` is the path of mel's min-max data file.

### Synthesizing
We use Parallel WaveGAN as the neural vocoder.
Download the pretrained parallel wavegan model from [pwgan_opencpop_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/t2s/svs/opencpop/pwgan_opencpop_ckpt_1.4.0.zip) and unzip it.
```bash
unzip pwgan_opencpop_ckpt_1.4.0.zip
```
The Parallel WaveGAN checkpoint contains the files listed below.
```text
pwgan_opencpop_ckpt_1.4.0.zip
├── default.yaml              # default config used to train parallel wavegan
├── snapshot_iter_100000.pdz  # model parameters of parallel wavegan
└── feats_stats.npy           # statistics used to normalize spectrogram when training parallel wavegan
```
`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize.py [-h]
                     [--am {diffsinger_opencpop}]
                     [--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
                     [--am_stat AM_STAT] [--phones_dict PHONES_DICT]
                     [--voc {pwgan_opencpop}]
                     [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
                     [--voc_stat VOC_STAT] [--ngpu NGPU]
                     [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR]
                     [--speech_stretchs SPEECH_STRETCHS]

Synthesize with acoustic model & vocoder

optional arguments:
  -h, --help            show this help message and exit
  --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}
                        Choose acoustic model type of tts task.
                        {diffsinger_opencpop} Choose acoustic model type of svs task.
  --am_config AM_CONFIG
                        Config of acoustic model.
  --am_ckpt AM_CKPT     Checkpoint file of acoustic model.
  --am_stat AM_STAT     mean and standard deviation used to normalize
                        spectrogram when training acoustic model.
  --phones_dict PHONES_DICT
                        phone vocabulary file.
  --tones_dict TONES_DICT
                        tone vocabulary file.
  --speaker_dict SPEAKER_DICT
                        speaker id map file.
  --voice-cloning VOICE_CLONING
                        whether training voice cloning model.
  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}
                        Choose vocoder type of tts task.
                        {pwgan_opencpop, hifigan_opencpop} Choose vocoder type of svs task.
  --voc_config VOC_CONFIG
                        Config of voc.
  --voc_ckpt VOC_CKPT   Checkpoint file of voc.
  --voc_stat VOC_STAT   mean and standard deviation used to normalize
                        spectrogram when training voc.
  --ngpu NGPU           if ngpu == 0, use cpu.
  --test_metadata TEST_METADATA
                        test metadata.
  --output_dir OUTPUT_DIR
                        output dir.
  --speech-stretchs SPEECH_STRETCHS
                        The min and max values of the mel spectrum, using on diffusion of diffsinger.
```

`./local/synthesize_e2e.sh` calls `${BIN_DIR}/../synthesize_e2e.py`, which can synthesize waveform from a text file.
`local/pinyin_to_phone.txt` comes from the readme of the opencpop dataset, indicating the mapping from pinyin to phonemes in opencpop.

```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize_e2e.py [-h]
                         [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}]
                         [--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
                         [--am_stat AM_STAT] [--phones_dict PHONES_DICT]
                         [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
                         [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}]
                         [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
                         [--voc_stat VOC_STAT] [--lang LANG]
                         [--inference_dir INFERENCE_DIR] [--ngpu NGPU]
                         [--text TEXT] [--output_dir OUTPUT_DIR]
                         [--pinyin_phone PINYIN_PHONE]
                         [--speech_stretchs SPEECH_STRETCHS]

Synthesize with acoustic model & vocoder

optional arguments:
  -h, --help            show this help message and exit
  --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}
                        Choose acoustic model type of tts task.
                        {diffsinger_opencpop} Choose acoustic model type of svs task.
  --am_config AM_CONFIG
                        Config of acoustic model.
  --am_ckpt AM_CKPT     Checkpoint file of acoustic model.
  --am_stat AM_STAT     mean and standard deviation used to normalize
                        spectrogram when training acoustic model.
  --phones_dict PHONES_DICT
                        phone vocabulary file.
  --speaker_dict SPEAKER_DICT
                        speaker id map file.
  --spk_id SPK_ID       spk id for multi speaker acoustic model
  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}
                        Choose vocoder type of tts task.
                        {pwgan_opencpop, hifigan_opencpop} Choose vocoder type of svs task.
  --voc_config VOC_CONFIG
                        Config of voc.
  --voc_ckpt VOC_CKPT   Checkpoint file of voc.
  --voc_stat VOC_STAT   mean and standard deviation used to normalize
                        spectrogram when training voc.
  --lang LANG           {zh, en, mix, canton} Choose language type of tts task.
                        {sing} Choose language type of svs task.
  --inference_dir INFERENCE_DIR
                        dir to save inference models
  --ngpu NGPU           if ngpu == 0, use cpu.
  --text TEXT           text to synthesize file, a 'utt_id sentence' pair per line for tts task.
                        A '{utt_id input_type (is word) text notes note_durs}' or '{utt_id input_type (is phoneme) phones notes note_durs is_slurs}' pair per line for svs task.
  --output_dir OUTPUT_DIR
                        output dir.
  --pinyin_phone PINYIN_PHONE
                        pinyin to phone map file, using on sing_frontend.
  --speech_stretchs SPEECH_STRETCHS
                        The min and max values of the mel spectrum, using on diffusion of diffsinger.
```
1. `--am` is the acoustic model type with the format {model_name}_{dataset}.
2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for the acoustic model, which correspond to the 4 files in the diffsinger pretrained model.
3. `--voc` is the vocoder type with the format {model_name}_{dataset}.
4. `--voc_config`, `--voc_ckpt` and `--voc_stat` are arguments for the vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
5. `--lang` is the language: `zh`, `en`, `mix` and `canton` for the tts task; `sing` for the svs task.
6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder.
7. `--text` is the text file, which contains sentences to synthesize.
8. `--output_dir` is the directory to save synthesized audio files.
9. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
10. `--inference_dir` is the directory to save static models. If this option is not given, no static model will be exported.
11. `--pinyin_phone` is the pinyin-to-phone map file used by the sing frontend.
12. `--speech_stretchs` gives the min and max values of the mel spectrum, used by the diffusion module of diffsinger.

Note: at present, the diffsinger model does not support dynamic-to-static conversion, so do not add `--inference_dir`.


## Pretrained Model
Pretrained DiffSinger model:
- [diffsinger_opencpop_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/t2s/svs/opencpop/diffsinger_opencpop_ckpt_1.4.0.zip)

The DiffSinger checkpoint contains the files listed below.
```text
diffsinger_opencpop_ckpt_1.4.0.zip
├── default.yaml              # default config used to train diffsinger
├── energy_stats.npy          # statistics used to normalize energy when training diffsinger if norm is needed
├── phone_id_map.txt          # phone vocabulary file when training diffsinger
├── pinyin_to_phone.txt       # pinyin-to-phoneme mapping file when training diffsinger
├── pitch_stats.npy           # statistics used to normalize pitch when training diffsinger if norm is needed
├── snapshot_iter_160000.pdz  # model parameters of diffsinger
├── speech_stats.npy          # statistics used to normalize mel when training diffsinger if norm is needed
└── speech_stretchs.npy       # min and max values to use for mel spectral stretching before training diffusion
```

You can use the following scripts to synthesize for `${BIN_DIR}/../sentences_sing.txt` using pretrained diffsinger and parallel wavegan models.

```bash
source path.sh

FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize_e2e.py \
    --am=diffsinger_opencpop \
    --am_config=diffsinger_opencpop_ckpt_1.4.0/default.yaml \
    --am_ckpt=diffsinger_opencpop_ckpt_1.4.0/snapshot_iter_160000.pdz \
    --am_stat=diffsinger_opencpop_ckpt_1.4.0/speech_stats.npy \
    --voc=pwgan_opencpop \
    --voc_config=pwgan_opencpop_ckpt_1.4.0/default.yaml \
    --voc_ckpt=pwgan_opencpop_ckpt_1.4.0/snapshot_iter_100000.pdz \
    --voc_stat=pwgan_opencpop_ckpt_1.4.0/feats_stats.npy \
    --lang=sing \
    --text=${BIN_DIR}/../sentences_sing.txt \
    --output_dir=exp/default/test_e2e \
    --phones_dict=diffsinger_opencpop_ckpt_1.4.0/phone_id_map.txt \
    --pinyin_phone=diffsinger_opencpop_ckpt_1.4.0/pinyin_to_phone.txt \
    --speech_stretchs=diffsinger_opencpop_ckpt_1.4.0/speech_stretchs.npy
```
@ -0,0 +1,159 @@
###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################

fs: 24000            # sr
n_fft: 512           # FFT size (samples).
n_shift: 128         # Hop size (samples). 12.5ms
win_length: 512      # Window length (samples). 50ms
                     # If set to null, it will be the same as fft_size.
window: "hann"       # Window function.

# Only used for feats_type != raw

fmin: 30             # Minimum frequency of Mel basis.
fmax: 12000          # Maximum frequency of Mel basis.
n_mels: 80           # The number of mel basis.

# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80            # Minimum f0 for pitch extraction.
f0max: 750           # Maximum f0 for pitch extraction.


###########################################################
#                       DATA SETTING                      #
###########################################################
batch_size: 48       # batch size
num_workers: 1       # number of workers in DataLoader


###########################################################
#                      MODEL SETTING                      #
###########################################################
model:
    # music score related
    note_num: 300            # number of notes
    is_slur_num: 2           # number of slur labels
    # fastspeech2 module options
    use_energy_pred: False   # whether to use energy predictor
    use_postnet: False       # whether to use postnet

    # fastspeech2 module
    fastspeech2_params:
        adim: 256                                    # attention dimension
        aheads: 2                                    # number of attention heads
        elayers: 4                                   # number of encoder layers
        eunits: 1024                                 # number of encoder ff units
        dlayers: 4                                   # number of decoder layers
        dunits: 1024                                 # number of decoder ff units
        positionwise_layer_type: conv1d-linear       # type of position-wise layer
        positionwise_conv_kernel_size: 9             # kernel size of position wise conv layer
        transformer_enc_dropout_rate: 0.1            # dropout rate for transformer encoder layer
        transformer_enc_positional_dropout_rate: 0.1 # dropout rate for transformer encoder positional encoding
        transformer_enc_attn_dropout_rate: 0.0       # dropout rate for transformer encoder attention layer
        transformer_activation_type: "gelu"          # activation function type in transformer
        encoder_normalize_before: True               # whether to perform layer normalization before the input
        decoder_normalize_before: True               # whether to perform layer normalization before the input
        reduction_factor: 1                          # reduction factor
        init_type: xavier_uniform                    # initialization type
        init_enc_alpha: 1.0                          # initial value of alpha of encoder scaled position encoding
        init_dec_alpha: 1.0                          # initial value of alpha of decoder scaled position encoding
        use_scaled_pos_enc: True                     # whether to use scaled positional encoding
        transformer_dec_dropout_rate: 0.1            # dropout rate for transformer decoder layer
        transformer_dec_positional_dropout_rate: 0.1 # dropout rate for transformer decoder positional encoding
        transformer_dec_attn_dropout_rate: 0.0       # dropout rate for transformer decoder attention layer
        duration_predictor_layers: 5                 # number of layers of duration predictor
        duration_predictor_chans: 256                # number of channels of duration predictor
        duration_predictor_kernel_size: 3            # filter size of duration predictor
        duration_predictor_dropout_rate: 0.5         # dropout rate in duration predictor
        pitch_predictor_layers: 5                    # number of conv layers in pitch predictor
        pitch_predictor_chans: 256                   # number of channels of conv layers in pitch predictor
        pitch_predictor_kernel_size: 5               # kernel size of conv layers in pitch predictor
        pitch_predictor_dropout: 0.5                 # dropout rate in pitch predictor
        pitch_embed_kernel_size: 1                   # kernel size of conv embedding layer for pitch
        pitch_embed_dropout: 0.0                     # dropout rate after conv embedding layer for pitch
        stop_gradient_from_pitch_predictor: True     # whether to stop the gradient from pitch predictor to encoder
        energy_predictor_layers: 2                   # number of conv layers in energy predictor
        energy_predictor_chans: 256                  # number of channels of conv layers in energy predictor
        energy_predictor_kernel_size: 3              # kernel size of conv layers in energy predictor
        energy_predictor_dropout: 0.5                # dropout rate in energy predictor
        energy_embed_kernel_size: 1                  # kernel size of conv embedding layer for energy
        energy_embed_dropout: 0.0                    # dropout rate after conv embedding layer for energy
        stop_gradient_from_energy_predictor: False   # whether to stop the gradient from energy predictor to encoder
        postnet_layers: 5                            # number of layers of postnet
        postnet_filts: 5                             # filter size of conv layers in postnet
        postnet_chans: 256                           # number of channels of conv layers in postnet
        postnet_dropout_rate: 0.5                    # dropout rate for postnet

    # denoiser module
    denoiser_params:
        in_channels: 80             # Number of channels of the input mel-spectrogram
        out_channels: 80            # Number of channels of the output mel-spectrogram
        kernel_size: 3              # Kernel size of the residual blocks inside
        layers: 20                  # Number of residual blocks inside
        stacks: 5                   # The number of groups to split the residual blocks into
        residual_channels: 256      # Residual channel of the residual blocks
        gate_channels: 512          # Gate channel of the residual blocks
        skip_channels: 256          # Skip channel of the residual blocks
        aux_channels: 256           # Auxiliary channel of the residual blocks
        dropout: 0.1                # Dropout of the residual blocks
        bias: True                  # Whether to use bias in residual blocks
        use_weight_norm: False      # Whether to use weight norm in all convolutions
        init_type: "kaiming_normal" # Type of initialize weights of a neural network module

    diffusion_params:
        num_train_timesteps: 100    # number of diffusion timesteps between real data and noise during training
        beta_start: 0.0001          # beta start parameter for the scheduler
        beta_end: 0.06              # beta end parameter for the scheduler
        beta_schedule: "linear"     # beta schedule parameter for the scheduler
        num_max_timesteps: 100      # max timestep for the transition from real data to noise
        stretch: True               # whether to stretch before diffusion


###########################################################
#                     UPDATER SETTING                     #
###########################################################
fs2_updater:
    use_masking: True    # whether to apply masking for padded part in loss calculation

ds_updater:
    use_masking: True    # whether to apply masking for padded part in loss calculation


###########################################################
#                    OPTIMIZER SETTING                    #
###########################################################
# fastspeech2 optimizer
fs2_optimizer:
    optim: adam          # optimizer type
    learning_rate: 0.001 # learning rate

# diffusion optimizer
ds_optimizer_params:
    beta1: 0.9
    beta2: 0.98
    weight_decay: 0.0

ds_scheduler_params:
    learning_rate: 0.001
    gamma: 0.5
    step_size: 50000
ds_grad_norm: 1


###########################################################
#                     INTERVAL SETTING                    #
###########################################################
only_train_diffusion: True   # Whether to freeze fastspeech2 parameters when training diffusion
ds_train_start_steps: 160000 # Number of steps to start to train diffusion module.
train_max_steps: 320000      # Number of training steps.
save_interval_steps: 2000    # Interval steps to save checkpoint.
eval_interval_steps: 2000    # Interval steps to evaluate the network.
num_snapshots: 5


###########################################################
#                      OTHER SETTING                      #
###########################################################
seed: 10086
@ -0,0 +1,418 @@
a|a
ai|ai
an|an
ang|ang
ao|ao
ba|b a
bai|b ai
ban|b an
bang|b ang
bao|b ao
bei|b ei
ben|b en
beng|b eng
bi|b i
bian|b ian
biao|b iao
bie|b ie
bin|b in
bing|b ing
bo|b o
bu|b u
ca|c a
cai|c ai
can|c an
cang|c ang
cao|c ao
ce|c e
cei|c ei
cen|c en
ceng|c eng
cha|ch a
chai|ch ai
chan|ch an
chang|ch ang
chao|ch ao
che|ch e
chen|ch en
cheng|ch eng
chi|ch i
chong|ch ong
chou|ch ou
chu|ch u
chua|ch ua
chuai|ch uai
chuan|ch uan
chuang|ch uang
chui|ch ui
chun|ch un
chuo|ch uo
ci|c i
cong|c ong
cou|c ou
cu|c u
cuan|c uan
cui|c ui
cun|c un
cuo|c uo
da|d a
dai|d ai
dan|d an
dang|d ang
dao|d ao
de|d e
dei|d ei
den|d en
deng|d eng
di|d i
dia|d ia
dian|d ian
diao|d iao
die|d ie
ding|d ing
diu|d iu
dong|d ong
dou|d ou
du|d u
duan|d uan
dui|d ui
dun|d un
duo|d uo
e|e
ei|ei
en|en
eng|eng
er|er
fa|f a
fan|f an
fang|f ang
fei|f ei
fen|f en
feng|f eng
fo|f o
fou|f ou
fu|f u
ga|g a
gai|g ai
gan|g an
gang|g ang
gao|g ao
ge|g e
gei|g ei
gen|g en
geng|g eng
gong|g ong
gou|g ou
gu|g u
gua|g ua
guai|g uai
guan|g uan
guang|g uang
gui|g ui
gun|g un
guo|g uo
ha|h a
hai|h ai
han|h an
hang|h ang
hao|h ao
he|h e
hei|h ei
hen|h en
heng|h eng
hm|h m
hng|h ng
hong|h ong
hou|h ou
hu|h u
hua|h ua
huai|h uai
huan|h uan
huang|h uang
hui|h ui
hun|h un
huo|h uo
ji|j i
jia|j ia
jian|j ian
jiang|j iang
jiao|j iao
jie|j ie
jin|j in
jing|j ing
jiong|j iong
jiu|j iu
ju|j v
juan|j van
jue|j ve
jun|j vn
ka|k a
kai|k ai
kan|k an
kang|k ang
kao|k ao
ke|k e
kei|k ei
ken|k en
keng|k eng
kong|k ong
kou|k ou
ku|k u
kua|k ua
kuai|k uai
kuan|k uan
kuang|k uang
kui|k ui
kun|k un
kuo|k uo
la|l a
lai|l ai
lan|l an
lang|l ang
lao|l ao
le|l e
lei|l ei
leng|l eng
li|l i
lia|l ia
lian|l ian
liang|l iang
liao|l iao
lie|l ie
lin|l in
ling|l ing
liu|l iu
lo|l o
long|l ong
lou|l ou
lu|l u
luan|l uan
lun|l un
luo|l uo
lv|l v
lve|l ve
m|m
ma|m a
mai|m ai
man|m an
mang|m ang
mao|m ao
me|m e
mei|m ei
men|m en
meng|m eng
mi|m i
mian|m ian
miao|m iao
mie|m ie
min|m in
ming|m ing
miu|m iu
mo|m o
mou|m ou
mu|m u
n|n
na|n a
nai|n ai
nan|n an
nang|n ang
nao|n ao
ne|n e
nei|n ei
nen|n en
neng|n eng
ng|n g
ni|n i
nian|n ian
niang|n iang
niao|n iao
nie|n ie
nin|n in
ning|n ing
niu|n iu
nong|n ong
nou|n ou
nu|n u
nuan|n uan
nun|n un
nuo|n uo
nv|n v
nve|n ve
o|o
ou|ou
pa|p a
pai|p ai
pan|p an
pang|p ang
pao|p ao
pei|p ei
pen|p en
peng|p eng
pi|p i
pian|p ian
piao|p iao
pie|p ie
pin|p in
ping|p ing
po|p o
pou|p ou
pu|p u
qi|q i
qia|q ia
qian|q ian
qiang|q iang
qiao|q iao
qie|q ie
qin|q in
qing|q ing
qiong|q iong
qiu|q iu
qu|q v
quan|q van
que|q ve
qun|q vn
ran|r an
rang|r ang
rao|r ao
re|r e
ren|r en
reng|r eng
ri|r i
rong|r ong
rou|r ou
ru|r u
rua|r ua
ruan|r uan
rui|r ui
run|r un
ruo|r uo
sa|s a
sai|s ai
san|s an
sang|s ang
sao|s ao
se|s e
sen|s en
seng|s eng
sha|sh a
shai|sh ai
shan|sh an
shang|sh ang
shao|sh ao
she|sh e
shei|sh ei
shen|sh en
sheng|sh eng
shi|sh i
shou|sh ou
shu|sh u
shua|sh ua
shuai|sh uai
shuan|sh uan
shuang|sh uang
shui|sh ui
shun|sh un
shuo|sh uo
si|s i
song|s ong
sou|s ou
su|s u
suan|s uan
sui|s ui
sun|s un
suo|s uo
ta|t a
tai|t ai
tan|t an
tang|t ang
tao|t ao
te|t e
tei|t ei
teng|t eng
ti|t i
tian|t ian
tiao|t iao
tie|t ie
ting|t ing
tong|t ong
tou|t ou
tu|t u
tuan|t uan
tui|t ui
tun|t un
tuo|t uo
wa|w a
wai|w ai
wan|w an
wang|w ang
wei|w ei
wen|w en
weng|w eng
wo|w o
wu|w u
xi|x i
xia|x ia
xian|x ian
xiang|x iang
xiao|x iao
xie|x ie
xin|x in
xing|x ing
xiong|x iong
xiu|x iu
xu|x v
xuan|x van
xue|x ve
xun|x vn
ya|y a
yan|y an
yang|y ang
yao|y ao
ye|y e
yi|y i
yin|y in
ying|y ing
yo|y o
yong|y ong
you|y ou
yu|y v
yuan|y van
yue|y ve
yun|y vn
za|z a
zai|z ai
zan|z an
zang|z ang
zao|z ao
ze|z e
zei|z ei
zen|z en
zeng|z eng
zha|zh a
zhai|zh ai
zhan|zh an
zhang|zh ang
zhao|zh ao
zhe|zh e
zhei|zh ei
zhen|zh en
zheng|zh eng
zhi|zh i
zhong|zh ong
zhou|zh ou
zhu|zh u
zhua|zh ua
zhuai|zh uai
zhuan|zh uan
zhuang|zh uang
zhui|zh ui
zhun|zh un
zhuo|zh uo
zi|z i
zong|z ong
zou|z ou
zu|z u
zuan|z uan
zui|z ui
zun|z un
zuo|z uo
@ -0,0 +1,74 @@
#!/bin/bash

stage=0
stop_stage=100

config_path=$1

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # extract features
    echo "Extract features ..."
    python3 ${BIN_DIR}/preprocess.py \
        --dataset=opencpop \
        --rootdir=~/datasets/Opencpop/segments \
        --dumpdir=dump \
        --label-file=~/datasets/Opencpop/segments/transcriptions.txt \
        --config=${config_path} \
        --num-cpu=20 \
        --cut-sil=True
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # get features' stats (mean and std)
    echo "Get features' stats ..."
    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="speech"

    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="pitch"

    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="energy"
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize and convert phone/speaker to id; dev and test should use train's stats
    echo "Normalize ..."
    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --pitch-stats=dump/train/pitch_stats.npy \
        --energy-stats=dump/train/energy_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt

    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --pitch-stats=dump/train/pitch_stats.npy \
        --energy-stats=dump/train/energy_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt

    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --pitch-stats=dump/train/pitch_stats.npy \
        --energy-stats=dump/train/energy_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt
fi

if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    # get feature (mel) extremum for diffusion stretch
    echo "Get feature(mel) extremum ..."
    python3 ${BIN_DIR}/get_minmax.py \
        --metadata=dump/train/norm/metadata.jsonl \
        --speech-stretchs=dump/train/speech_stretchs.npy
fi
@ -0,0 +1,27 @@
#!/bin/bash

config_path=$1
train_output_path=$2
ckpt_name=$3
stage=0
stop_stage=0

# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
    python3 ${BIN_DIR}/../synthesize.py \
        --am=diffsinger_opencpop \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
        --voc=pwgan_opencpop \
        --voc_config=pwgan_opencpop_ckpt_1.4.0/default.yaml \
        --voc_ckpt=pwgan_opencpop_ckpt_1.4.0/snapshot_iter_100000.pdz \
        --voc_stat=pwgan_opencpop_ckpt_1.4.0/feats_stats.npy \
        --test_metadata=dump/test/norm/metadata.jsonl \
        --output_dir=${train_output_path}/test \
        --phones_dict=dump/phone_id_map.txt \
        --speech_stretchs=dump/train/speech_stretchs.npy
fi
@ -0,0 +1,53 @@
#!/bin/bash

config_path=$1
train_output_path=$2
ckpt_name=$3

stage=0
stop_stage=0

# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
    python3 ${BIN_DIR}/../synthesize_e2e.py \
        --am=diffsinger_opencpop \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
        --voc=pwgan_opencpop \
        --voc_config=pwgan_opencpop_ckpt_1.4.0/default.yaml \
        --voc_ckpt=pwgan_opencpop_ckpt_1.4.0/snapshot_iter_100000.pdz \
        --voc_stat=pwgan_opencpop_ckpt_1.4.0/feats_stats.npy \
        --lang=sing \
        --text=${BIN_DIR}/../sentences_sing.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --speech_stretchs=dump/train/speech_stretchs.npy \
        --pinyin_phone=local/pinyin_to_phone.txt
fi

# for more GAN Vocoders
# hifigan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    echo "in hifigan syn_e2e"
    FLAGS_allocator_strategy=naive_best_fit \
    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
    python3 ${BIN_DIR}/../synthesize_e2e.py \
        --am=diffsinger_opencpop \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
        --voc=hifigan_opencpop \
        --voc_config=hifigan_opencpop_ckpt_1.4.0/default.yaml \
        --voc_ckpt=hifigan_opencpop_ckpt_1.4.0/snapshot_iter_625000.pdz \
        --voc_stat=hifigan_opencpop_ckpt_1.4.0/feats_stats.npy \
        --lang=sing \
        --text=${BIN_DIR}/../sentences_sing.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --speech_stretchs=dump/train/speech_stretchs.npy \
        --pinyin_phone=local/pinyin_to_phone.txt
fi
@ -0,0 +1,13 @@
#!/bin/bash

config_path=$1
train_output_path=$2

python3 ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=${config_path} \
    --output-dir=${train_output_path} \
    --ngpu=1 \
    --phones-dict=dump/phone_id_map.txt \
    --speech-stretchs=dump/train/speech_stretchs.npy
@ -0,0 +1,13 @@
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`

export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C

export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}

MODEL=diffsinger
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
@ -0,0 +1,37 @@
#!/bin/bash

set -e
source path.sh

gpus=0
stage=0
stop_stage=100

conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_320000.pdz

# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be mixed with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
    ./local/preprocess.sh ${conf_path} || exit -1
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train model, all `ckpt` files are saved under the `train_output_path/checkpoints/` dir
    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # synthesize, vocoder is pwgan by default
    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # synthesize_e2e, vocoder is pwgan by default
    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
@ -0,0 +1,139 @@
# Parallel WaveGAN with Opencpop
This example contains code used to train a [parallel wavegan](http://arxiv.org/abs/1910.11480) model with [Mandarin singing corpus](https://wenet.org.cn/opencpop/).

## Dataset
### Download and Extract
Download Opencpop from its [Official Website](https://wenet.org.cn/opencpop/download/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/Opencpop`.
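For example, assuming the downloaded archive is named `Opencpop.zip` (the actual file name on the download page may differ), extraction could look like this:
```bash
# Hypothetical archive name; adjust to the file actually downloaded.
mkdir -p ~/datasets
unzip Opencpop.zip -d ~/datasets
ls ~/datasets/Opencpop
```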
## Get Started
Assume the path to the dataset is `~/datasets/Opencpop`.
Run the command below to
1. **source path**.
2. preprocess the dataset.
3. train the model.
4. synthesize wavs.
    - synthesize waveform from `metadata.jsonl`.
```bash
./run.sh
```
You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage. For example, running the following command will only preprocess the dataset.
```bash
./run.sh --stage 0 --stop-stage 0
```
### Data Preprocessing
```bash
./local/preprocess.sh ${conf_path}
```
When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.

```text
dump
├── dev
│   ├── norm
│   └── raw
├── test
│   ├── norm
│   └── raw
└── train
    ├── norm
    ├── raw
    └── feats_stats.npy
```
The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The `raw` folder contains the log magnitude of the mel spectrogram of each utterance, while the `norm` folder contains the normalized spectrogram. The statistics used to normalize the spectrogram are computed from the training set, which is located in `dump/train/feats_stats.npy`.

Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains ids and paths to the spectrogram of each utterance.
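Since the file is in JSON Lines format, each row is a standalone JSON object; a quick way to inspect one record (assuming the standard layout produced by preprocessing) is:
```bash
# Pretty-print the first record of the normalized train metadata.
head -n 1 dump/train/norm/metadata.jsonl | python3 -m json.tool
```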
### Model Training
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
`./local/train.sh` calls `${BIN_DIR}/train.py`.
Here's the complete help message.

```text
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
                [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER]
                [--run-benchmark RUN_BENCHMARK]
                [--profiler_options PROFILER_OPTIONS]

Train a ParallelWaveGAN model.

optional arguments:
  -h, --help            show this help message and exit
  --config CONFIG       ParallelWaveGAN config file.
  --train-metadata TRAIN_METADATA
                        training data.
  --dev-metadata DEV_METADATA
                        dev data.
  --output-dir OUTPUT_DIR
                        output dir.
  --ngpu NGPU           if ngpu == 0, use cpu.

benchmark:
  arguments related to benchmark.

  --batch-size BATCH_SIZE
                        batch size.
  --max-iter MAX_ITER   train max steps.
  --run-benchmark RUN_BENCHMARK
                        runing benchmark or not, if True, use the --batch-size
                        and --max-iter.
  --profiler_options PROFILER_OPTIONS
                        The option of profiler, which should be in format
                        "key1=value1;key2=value2;key3=value3".
```

1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory.
4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
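Putting arguments 1 to 4 together, a direct call to `train.py` might look like the sketch below. This mirrors what `local/train.sh` is expected to run, though the wrapper script itself is not shown in this diff.
```bash
# Minimal direct invocation sketch, using the dump layout produced above.
python3 ${BIN_DIR}/train.py \
    --config=conf/default.yaml \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --output-dir=exp/default \
    --ngpu=1
```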
### Synthesizing
`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG]
                     [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA]
                     [--output-dir OUTPUT_DIR] [--ngpu NGPU]

Synthesize with GANVocoder.

optional arguments:
  -h, --help            show this help message and exit
  --generator-type GENERATOR_TYPE
                        type of GANVocoder, should in {pwgan, mb_melgan,
                        style_melgan, } now
  --config CONFIG       GANVocoder config file.
  --checkpoint CHECKPOINT
                        snapshot to load.
  --test-metadata TEST_METADATA
                        dev data.
  --output-dir OUTPUT_DIR
                        output dir.
  --ngpu NGPU           if ngpu == 0, use cpu.
```

1. `--config` is the parallel wavegan config file. You should use the same config with which the model is trained.
2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory.
3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory.
4. `--output-dir` is the directory to save the synthesized audio files.
5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
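Analogously, a direct call to `synthesize.py` could look like this sketch; the checkpoint name is an assumption based on `train_max_steps: 400000` in `conf/default.yaml`, so substitute whichever snapshot your run actually produced.
```bash
# Hypothetical checkpoint name; pick a real one from exp/default/checkpoints/.
python3 ${BIN_DIR}/../synthesize.py \
    --config=conf/default.yaml \
    --checkpoint=exp/default/checkpoints/snapshot_iter_400000.pdz \
    --test-metadata=dump/dev/norm/metadata.jsonl \
    --output-dir=exp/default/test \
    --ngpu=1
```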
## Pretrained Models
The pretrained model can be downloaded here:
- [pwgan_opencpop_ckpt_1.4.0](https://paddlespeech.bj.bcebos.com/t2s/svs/opencpop/pwgan_opencpop_ckpt_1.4.0.zip)

The Parallel WaveGAN checkpoint contains the files listed below.

```text
pwgan_opencpop_ckpt_1.4.0
├── default.yaml              # default config used to train parallel wavegan
├── snapshot_iter_100000.pdz  # generator parameters of parallel wavegan
└── feats_stats.npy           # statistics used to normalize spectrogram when training parallel wavegan
```
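To use the pretrained vocoder (for example with the DiffSinger `synthesize_e2e` command shown earlier), download and unpack it in the working directory:
```bash
# Fetch and unpack the pretrained Parallel WaveGAN checkpoint.
wget https://paddlespeech.bj.bcebos.com/t2s/svs/opencpop/pwgan_opencpop_ckpt_1.4.0.zip
unzip pwgan_opencpop_ckpt_1.4.0.zip
```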
## Acknowledgement
We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.
@ -0,0 +1,119 @@
# This is the hyperparameter configuration file for Parallel WaveGAN.
# Please make sure this is adjusted for the Opencpop dataset. If you want to
# apply it to another dataset, you might need to carefully change some parameters.
# This configuration requires 12 GB GPU memory and takes ~3 days on RTX TITAN.

###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
fs: 24000            # Sampling rate.
n_fft: 512           # FFT size (samples).
n_shift: 128         # Hop size (samples). 12.5ms
win_length: 512      # Window length (samples). 50ms
                     # If set to null, it will be the same as fft_size.
window: "hann"       # Window function.
n_mels: 80           # Number of mel basis.
fmin: 30             # Minimum freq in mel basis calculation. (Hz)
fmax: 12000          # Maximum frequency in mel basis calculation. (Hz)


###########################################################
#         GENERATOR NETWORK ARCHITECTURE SETTING          #
###########################################################
generator_params:
    in_channels: 1                # Number of input channels.
    out_channels: 1               # Number of output channels.
    kernel_size: 3                # Kernel size of dilated convolution.
    layers: 30                    # Number of residual block layers.
    stacks: 3                     # Number of stacks, i.e., dilation cycles.
    residual_channels: 64         # Number of channels in residual conv.
    gate_channels: 128            # Number of channels in gated conv.
    skip_channels: 64             # Number of channels in skip conv.
    aux_channels: 80              # Number of channels for auxiliary feature conv.
                                  # Must be the same as num_mels.
    aux_context_window: 2         # Context window size for auxiliary feature.
                                  # If set to 2, previous 2 and future 2 frames will be considered.
    dropout: 0.0                  # Dropout rate. 0.0 means no dropout applied.
    bias: True                    # Use bias in residual blocks.
    use_weight_norm: True         # Whether to use weight norm.
                                  # If set to true, it will be applied to all of the conv layers.
    use_causal_conv: False        # Use causal conv in residual blocks and upsample layers.
    upsample_scales: [8, 4, 2, 2] # Upsampling scales. Product of these must be the same as hop size.
    interpolate_mode: "nearest"   # Upsample net interpolate mode.
    freq_axis_kernel_size: 1      # Upsampling net: convolution kernel size in frequency axis.
    nonlinear_activation: null
    nonlinear_activation_params: {}

###########################################################
#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
###########################################################
discriminator_params:
    in_channels: 1                    # Number of input channels.
    out_channels: 1                   # Number of output channels.
    kernel_size: 3                    # Kernel size of conv layers.
    layers: 10                        # Number of conv layers.
    conv_channels: 64                 # Number of conv channels.
    bias: True                        # Whether to use bias parameter in conv.
    use_weight_norm: True             # Whether to use weight norm.
                                      # If set to true, it will be applied to all of the conv layers.
    nonlinear_activation: "leakyrelu" # Nonlinear function after each conv.
    nonlinear_activation_params:      # Nonlinear function parameters
        negative_slope: 0.2           # Alpha in leakyrelu.

###########################################################
#                    STFT LOSS SETTING                    #
###########################################################
stft_loss_params:
    fft_sizes: [1024, 2048, 512]  # List of FFT size for STFT-based loss.
    hop_sizes: [120, 240, 50]     # List of hop size for STFT-based loss.
    win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
    window: "hann"                # Window function for STFT-based loss.

###########################################################
#                 ADVERSARIAL LOSS SETTING                #
###########################################################
lambda_adv: 4.0  # Loss balancing coefficient.

###########################################################
#                   DATA LOADER SETTING                   #
###########################################################
batch_size: 8          # Batch size.
batch_max_steps: 25500 # Length of each audio in batch. Make sure it is divisible by n_shift.
num_workers: 1         # Number of workers in DataLoader.

###########################################################
#              OPTIMIZER & SCHEDULER SETTING              #
###########################################################
generator_optimizer_params:
    epsilon: 1.0e-6        # Generator's epsilon.
    weight_decay: 0.0      # Generator's weight decay coefficient.
generator_scheduler_params:
    learning_rate: 0.0001  # Generator's learning rate.
    step_size: 200000      # Generator's scheduler step size.
    gamma: 0.5             # Generator's scheduler gamma.
                           # At each step size, lr will be multiplied by this parameter.
generator_grad_norm: 10    # Generator's gradient norm.
discriminator_optimizer_params:
    epsilon: 1.0e-6        # Discriminator's epsilon.
    weight_decay: 0.0      # Discriminator's weight decay coefficient.
discriminator_scheduler_params:
    learning_rate: 0.00005 # Discriminator's learning rate.
    step_size: 200000      # Discriminator's scheduler step size.
    gamma: 0.5             # Discriminator's scheduler gamma.
                           # At each step size, lr will be multiplied by this parameter.
discriminator_grad_norm: 1 # Discriminator's gradient norm.

###########################################################
#                     INTERVAL SETTING                    #
###########################################################
discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator.
train_max_steps: 400000                 # Number of training steps.
save_interval_steps: 5000               # Interval steps to save checkpoint.
eval_interval_steps: 1000               # Interval steps to evaluate the network.

###########################################################
#                      OTHER SETTING                      #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
num_snapshots: 10                # max number of snapshots to keep while training
seed: 42                         # random seed for paddle, random, and np.random
@ -0,0 +1 @@
../../../csmsc/voc1/local/PTQ_static.sh
@ -0,0 +1,15 @@
#!/bin/bash

config_path=$1
train_output_path=$2
ckpt_name=$3

FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../../dygraph_to_static.py \
    --type=voc \
    --voc=pwgan_opencpop \
    --voc_config=${config_path} \
    --voc_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
    --voc_stat=dump/train/feats_stats.npy \
    --inference_dir=exp/default/inference/
@ -0,0 +1,47 @@
#!/bin/bash

stage=0
stop_stage=100

config_path=$1

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # extract features
    echo "Extract features ..."
    python3 ${BIN_DIR}/../preprocess.py \
        --rootdir=~/datasets/Opencpop/segments/ \
        --dataset=opencpop \
        --dumpdir=dump \
        --dur-file=~/datasets/Opencpop/segments/transcriptions.txt \
        --config=${config_path} \
        --cut-sil=False \
        --num-cpu=20
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # get features' stats (mean and std)
    echo "Get features' stats ..."
    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="feats"
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize; dev and test should use train's stats
    echo "Normalize ..."

    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --stats=dump/train/feats_stats.npy
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --stats=dump/train/feats_stats.npy

    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --stats=dump/train/feats_stats.npy
fi
@ -0,0 +1 @@
../../../csmsc/voc1/local/synthesize.sh
@ -0,0 +1 @@
../../../csmsc/voc1/local/train.sh
@ -0,0 +1 @@
../../csmsc/voc1/path.sh
@ -0,0 +1,42 @@
#!/bin/bash

set -e
source path.sh

gpus=0
stage=0
stop_stage=100

conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_100000.pdz

# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be mixed with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
    ./local/preprocess.sh ${conf_path} || exit -1
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train model, all `ckpt` files are saved under the `train_output_path/checkpoints/` dir
    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # synthesize
    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi

# dygraph to static
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    CUDA_VISIBLE_DEVICES=${gpus} ./local/dygraph_to_static.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi

# PTQ_static
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} pwgan_opencpop || exit -1
fi
@ -0,0 +1,167 @@
# This is the configuration file for the Opencpop dataset.
# This configuration is based on HiFiGAN V1, which is an official configuration.
# But I found that the optimizer setting does not work well with my implementation.
# So I changed optimizer settings as follows:
# - AdamW -> Adam
# - betas: [0.8, 0.99] -> betas: [0.5, 0.9]
# - Scheduler: ExponentialLR -> MultiStepLR
# To match the shift size difference, the upsample scales are also modified from the original 256 shift setting.

###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
fs: 24000            # Sampling rate.
n_fft: 512           # FFT size (samples).
n_shift: 128         # Hop size (samples). 12.5ms
win_length: 512      # Window length (samples). 50ms
                     # If set to null, it will be the same as fft_size.
window: "hann"       # Window function.
n_mels: 80           # Number of mel basis.
fmin: 80             # Minimum freq in mel basis calculation. (Hz)
fmax: 12000          # Maximum frequency in mel basis calculation. (Hz)

###########################################################
#         GENERATOR NETWORK ARCHITECTURE SETTING          #
###########################################################
generator_params:
    in_channels: 80                      # Number of input channels.
    out_channels: 1                      # Number of output channels.
    channels: 512                        # Number of initial channels.
    kernel_size: 7                       # Kernel size of initial and final conv layers.
    upsample_scales: [8, 4, 2, 2]        # Upsampling scales.
    upsample_kernel_sizes: [16, 8, 4, 4] # Kernel size for upsampling layers.
    resblock_kernel_sizes: [3, 7, 11]    # Kernel size for residual blocks.
    resblock_dilations:                  # Dilations for residual blocks.
        - [1, 3, 5]
        - [1, 3, 5]
        - [1, 3, 5]
    use_additional_convs: True           # Whether to use additional conv layer in residual blocks.
    bias: True                           # Whether to use bias parameter in conv.
    nonlinear_activation: "leakyrelu"    # Nonlinear activation type.
    nonlinear_activation_params:         # Nonlinear activation parameters.
        negative_slope: 0.1
    use_weight_norm: True                # Whether to apply weight normalization.


###########################################################
#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
###########################################################
discriminator_params:
    scales: 3                              # Number of multi-scale discriminators.
    scale_downsample_pooling: "AvgPool1D"  # Pooling operation for scale discriminator.
    scale_downsample_pooling_params:
        kernel_size: 4                     # Pooling kernel size.
        stride: 2                          # Pooling stride.
        padding: 2                         # Padding size.
    scale_discriminator_params:
        in_channels: 1                     # Number of input channels.
        out_channels: 1                    # Number of output channels.
        kernel_sizes: [15, 41, 5, 3]       # List of kernel sizes.
        channels: 128                      # Initial number of channels.
        max_downsample_channels: 1024      # Maximum number of channels in downsampling conv layers.
        max_groups: 16                     # Maximum number of groups in downsampling conv layers.
        bias: True
        downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales.
        nonlinear_activation: "leakyrelu"  # Nonlinear activation.
        nonlinear_activation_params:
            negative_slope: 0.1
    follow_official_norm: True             # Whether to follow the official norm setting.
    periods: [2, 3, 5, 7, 11]              # List of periods for multi-period discriminator.
    period_discriminator_params:
        in_channels: 1                     # Number of input channels.
        out_channels: 1                    # Number of output channels.
        kernel_sizes: [5, 3]               # List of kernel sizes.
        channels: 32                       # Initial number of channels.
        downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales.
        max_downsample_channels: 1024      # Maximum number of channels in downsampling conv layers.
        bias: True                         # Whether to use bias parameter in conv layer.
        nonlinear_activation: "leakyrelu"  # Nonlinear activation.
        nonlinear_activation_params:       # Nonlinear activation parameters.
            negative_slope: 0.1
        use_weight_norm: True              # Whether to apply weight normalization.
        use_spectral_norm: False           # Whether to apply spectral normalization.


###########################################################
#                    STFT LOSS SETTING                    #
###########################################################
use_stft_loss: False # Whether to use multi-resolution STFT loss.
use_mel_loss: True   # Whether to use Mel-spectrogram loss.
mel_loss_params:
    fs: 24000
    fft_size: 512
    hop_size: 128
    win_length: 512
    window: "hann"
    num_mels: 80
    fmin: 30
    fmax: 12000
    log_base: null
generator_adv_loss_params:
    average_by_discriminators: False # Whether to average loss by #discriminators.
discriminator_adv_loss_params:
    average_by_discriminators: False # Whether to average loss by #discriminators.
use_feat_match_loss: True
feat_match_loss_params:
    average_by_discriminators: False # Whether to average loss by #discriminators.
    average_by_layers: False         # Whether to average loss by #layers in each discriminator.
    include_final_outputs: False     # Whether to include final outputs in feat match loss calculation.

###########################################################
#                 ADVERSARIAL LOSS SETTING                #
###########################################################
lambda_aux: 45.0       # Loss balancing coefficient for STFT loss.
lambda_adv: 1.0        # Loss balancing coefficient for adversarial loss.
lambda_feat_match: 2.0 # Loss balancing coefficient for feat match loss.

###########################################################
#                   DATA LOADER SETTING                   #
###########################################################
batch_size: 16        # Batch size.
batch_max_steps: 8400 # Length of each audio in batch. Make sure it is divisible by hop_size.
num_workers: 1        # Number of workers in DataLoader.

###########################################################
#              OPTIMIZER & SCHEDULER SETTING              #
###########################################################
generator_optimizer_params:
    beta1: 0.5
    beta2: 0.9
    weight_decay: 0.0       # Generator's weight decay coefficient.
generator_scheduler_params:
    learning_rate: 2.0e-4   # Generator's learning rate.
    gamma: 0.5              # Generator's scheduler gamma.
    milestones:             # At each milestone, lr will be multiplied by gamma.
        - 200000
        - 400000
        - 600000
        - 800000
generator_grad_norm: -1     # Generator's gradient norm.
discriminator_optimizer_params:
    beta1: 0.5
    beta2: 0.9
    weight_decay: 0.0       # Discriminator's weight decay coefficient.
discriminator_scheduler_params:
    learning_rate: 2.0e-4   # Discriminator's learning rate.
    gamma: 0.5              # Discriminator's scheduler gamma.
    milestones:             # At each milestone, lr will be multiplied by gamma.
        - 200000
        - 400000
        - 600000
        - 800000
discriminator_grad_norm: -1 # Discriminator's gradient norm.

###########################################################
#                     INTERVAL SETTING                    #
###########################################################
generator_train_start_steps: 1     # Number of steps to start to train generator.
discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
train_max_steps: 2500000           # Number of training steps.
save_interval_steps: 5000          # Interval steps to save checkpoint.
eval_interval_steps: 1000          # Interval steps to evaluate the network.

###########################################################
#                      OTHER SETTING                      #
###########################################################
num_snapshots: 4 # max number of snapshots to keep while training
seed: 42         # random seed for paddle, random, and np.random
@ -0,0 +1,168 @@
# This is the configuration file for the Opencpop dataset.
# This configuration is based on HiFiGAN V1, which is an official configuration.
# But I found that the optimizer setting does not work well with my implementation.
# So I changed optimizer settings as follows:
# - AdamW -> Adam
# - betas: [0.8, 0.99] -> betas: [0.5, 0.9]
# - Scheduler: ExponentialLR -> MultiStepLR
# To match the shift size difference, the upsample scales are also modified from the original 256 shift setting.

###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
fs: 24000            # Sampling rate.
n_fft: 512           # FFT size (samples).
n_shift: 128         # Hop size (samples). 12.5ms
win_length: 512      # Window length (samples). 50ms
                     # If set to null, it will be the same as fft_size.
window: "hann"       # Window function.
n_mels: 80           # Number of mel basis.
fmin: 80             # Minimum freq in mel basis calculation. (Hz)
fmax: 12000          # Maximum frequency in mel basis calculation. (Hz)

###########################################################
#         GENERATOR NETWORK ARCHITECTURE SETTING          #
###########################################################
generator_params:
    in_channels: 80                      # Number of input channels.
    out_channels: 1                      # Number of output channels.
    channels: 512                        # Number of initial channels.
    kernel_size: 7                       # Kernel size of initial and final conv layers.
    upsample_scales: [8, 4, 2, 2]        # Upsampling scales.
    upsample_kernel_sizes: [16, 8, 4, 4] # Kernel size for upsampling layers.
    resblock_kernel_sizes: [3, 7, 11]    # Kernel size for residual blocks.
    resblock_dilations:                  # Dilations for residual blocks.
        - [1, 3, 5]
        - [1, 3, 5]
        - [1, 3, 5]
    use_additional_convs: True           # Whether to use additional conv layer in residual blocks.
    bias: True                           # Whether to use bias parameter in conv.
    nonlinear_activation: "leakyrelu"    # Nonlinear activation type.
    nonlinear_activation_params:         # Nonlinear activation parameters.
        negative_slope: 0.1
    use_weight_norm: True                # Whether to apply weight normalization.


###########################################################
#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
###########################################################
discriminator_params:
    scales: 3                              # Number of multi-scale discriminators.
    scale_downsample_pooling: "AvgPool1D"  # Pooling operation for scale discriminator.
    scale_downsample_pooling_params:
        kernel_size: 4                     # Pooling kernel size.
        stride: 2                          # Pooling stride.
        padding: 2                         # Padding size.
    scale_discriminator_params:
        in_channels: 1                     # Number of input channels.
        out_channels: 1                    # Number of output channels.
        kernel_sizes: [15, 41, 5, 3]       # List of kernel sizes.
        channels: 128                      # Initial number of channels.
        max_downsample_channels: 1024      # Maximum number of channels in downsampling conv layers.
        max_groups: 16                     # Maximum number of groups in downsampling conv layers.
        bias: True
        downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales.
        nonlinear_activation: "leakyrelu"  # Nonlinear activation.
        nonlinear_activation_params:
            negative_slope: 0.1
    follow_official_norm: True             # Whether to follow the official norm setting.
    periods: [2, 3, 5, 7, 11]              # List of periods for multi-period discriminator.
    period_discriminator_params:
        in_channels: 1                     # Number of input channels.
        out_channels: 1                    # Number of output channels.
        kernel_sizes: [5, 3]               # List of kernel sizes.
        channels: 32                       # Initial number of channels.
        downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales.
        max_downsample_channels: 1024      # Maximum number of channels in downsampling conv layers.
        bias: True                         # Whether to use bias parameter in conv layer.
        nonlinear_activation: "leakyrelu"  # Nonlinear activation.
        nonlinear_activation_params:       # Nonlinear activation parameters.
            negative_slope: 0.1
        use_weight_norm: True              # Whether to apply weight normalization.
        use_spectral_norm: False           # Whether to apply spectral normalization.


###########################################################
#                    STFT LOSS SETTING                    #
###########################################################
use_stft_loss: False # Whether to use multi-resolution STFT loss.
use_mel_loss: True   # Whether to use Mel-spectrogram loss.
mel_loss_params:
    fs: 24000
    fft_size: 512
    hop_size: 128
    win_length: 512
    window: "hann"
    num_mels: 80
    fmin: 30
    fmax: 12000
    log_base: null
generator_adv_loss_params:
    average_by_discriminators: False # Whether to average loss by #discriminators.
discriminator_adv_loss_params:
    average_by_discriminators: False # Whether to average loss by #discriminators.
use_feat_match_loss: True
feat_match_loss_params:
    average_by_discriminators: False # Whether to average loss by #discriminators.
    average_by_layers: False         # Whether to average loss by #layers in each discriminator.
    include_final_outputs: False     # Whether to include final outputs in feat match loss calculation.

###########################################################
#                 ADVERSARIAL LOSS SETTING                #
###########################################################
lambda_aux: 45.0       # Loss balancing coefficient for STFT loss.
lambda_adv: 1.0        # Loss balancing coefficient for adversarial loss.
lambda_feat_match: 2.0 # Loss balancing coefficient for feat match loss.

###########################################################
#                   DATA LOADER SETTING                   #
###########################################################
#batch_size: 16       # Batch size.
batch_size: 1         # Batch size.
batch_max_steps: 8400 # Length of each audio in batch. Make sure it is divisible by hop_size.
num_workers: 1        # Number of workers in DataLoader.

###########################################################
#              OPTIMIZER & SCHEDULER SETTING              #
###########################################################
generator_optimizer_params:
    beta1: 0.5
    beta2: 0.9
    weight_decay: 0.0       # Generator's weight decay coefficient.
generator_scheduler_params:
    learning_rate: 2.0e-4   # Generator's learning rate.
    gamma: 0.5              # Generator's scheduler gamma.
    milestones:             # At each milestone, lr will be multiplied by gamma.
        - 200000
        - 400000
        - 600000
        - 800000
generator_grad_norm: -1     # Generator's gradient norm.
discriminator_optimizer_params:
    beta1: 0.5
    beta2: 0.9
    weight_decay: 0.0       # Discriminator's weight decay coefficient.
discriminator_scheduler_params:
    learning_rate: 2.0e-4   # Discriminator's learning rate.
    gamma: 0.5              # Discriminator's scheduler gamma.
    milestones:             # At each milestone, lr will be multiplied by gamma.
        - 200000
        - 400000
        - 600000
        - 800000
discriminator_grad_norm: -1 # Discriminator's gradient norm.

###########################################################
#                     INTERVAL SETTING                    #
###########################################################
generator_train_start_steps: 1     # Number of steps to start to train generator.
discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
train_max_steps: 2600000           # Number of training steps.
save_interval_steps: 5000          # Interval steps to save checkpoint.
eval_interval_steps: 1000          # Interval steps to evaluate the network.

###########################################################
#                      OTHER SETTING                      #
###########################################################
num_snapshots: 4 # max number of snapshots to keep while training
seed: 42         # random seed for paddle, random, and np.random
@ -0,0 +1,74 @@
#!/bin/bash

source path.sh

gpus=0
stage=0
stop_stage=100

source ${MAIN_ROOT}/utils/parse_options.sh || exit 1

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${MAIN_ROOT}/paddlespeech/t2s/exps/diffsinger/gen_gta_mel.py \
        --diffsinger-config=diffsinger_opencpop_ckpt_1.4.0/default.yaml \
        --diffsinger-checkpoint=diffsinger_opencpop_ckpt_1.4.0/snapshot_iter_160000.pdz \
        --diffsinger-stat=diffsinger_opencpop_ckpt_1.4.0/speech_stats.npy \
        --diffsinger-stretch=diffsinger_opencpop_ckpt_1.4.0/speech_stretchs.npy \
        --dur-file=~/datasets/Opencpop/segments/transcriptions.txt \
        --output-dir=dump_finetune \
        --phones-dict=diffsinger_opencpop_ckpt_1.4.0/phone_id_map.txt \
        --dataset=opencpop \
        --rootdir=~/datasets/Opencpop/segments/
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${MAIN_ROOT}/utils/link_wav.py \
        --old-dump-dir=dump \
        --dump-dir=dump_finetune
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # get features' stats (mean and std)
    echo "Get features' stats ..."
    cp dump/train/feats_stats.npy dump_finetune/train/
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize; dev and test should use train's stats
    echo "Normalize ..."

    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump_finetune/train/raw/metadata.jsonl \
        --dumpdir=dump_finetune/train/norm \
        --stats=dump_finetune/train/feats_stats.npy
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump_finetune/dev/raw/metadata.jsonl \
        --dumpdir=dump_finetune/dev/norm \
        --stats=dump_finetune/train/feats_stats.npy

    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump_finetune/test/raw/metadata.jsonl \
        --dumpdir=dump_finetune/test/norm \
        --stats=dump_finetune/train/feats_stats.npy
fi

# create finetune env
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    echo "create finetune env"
    python3 local/prepare_env.py \
        --pretrained_model_dir=exp/default/checkpoints/ \
        --output_dir=exp/finetune/
fi

# finetune
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
    CUDA_VISIBLE_DEVICES=${gpus} \
    FLAGS_cudnn_exhaustive_search=true \
    FLAGS_conv_workspace_size_limit=4000 \
    python ${BIN_DIR}/train.py \
        --train-metadata=dump_finetune/train/norm/metadata.jsonl \
        --dev-metadata=dump_finetune/dev/norm/metadata.jsonl \
        --config=conf/finetune.yaml \
        --output-dir=exp/finetune \
        --ngpu=1
fi
@ -0,0 +1 @@
../../../csmsc/voc1/local/PTQ_static.sh
@ -0,0 +1,15 @@
#!/bin/bash

config_path=$1
train_output_path=$2
ckpt_name=$3

FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../../dygraph_to_static.py \
    --type=voc \
    --voc=hifigan_opencpop \
    --voc_config=${config_path} \
    --voc_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
    --voc_stat=dump/train/feats_stats.npy \
    --inference_dir=exp/default/inference/
@ -0,0 +1 @@
../../../other/tts_finetune/tts3/local/prepare_env.py
@ -0,0 +1 @@
../../voc1/local/preprocess.sh
@ -0,0 +1 @@
../../../csmsc/voc5/local/synthesize.sh
@ -0,0 +1 @@
../../../csmsc/voc1/local/train.sh
@ -0,0 +1 @@
../../csmsc/voc5/path.sh
@ -0,0 +1,42 @@
#!/bin/bash

set -e
source path.sh

gpus=0
stage=0
stop_stage=100

conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_2500000.pdz

# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be mixed with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
    ./local/preprocess.sh ${conf_path} || exit -1
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train model, all `ckpt` files are saved under the `train_output_path/checkpoints/` dir
    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # synthesize
    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi

# dygraph to static
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    CUDA_VISIBLE_DEVICES=${gpus} ./local/dygraph_to_static.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi

# PTQ_static
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} hifigan_opencpop || exit -1
fi