|
|
@ -252,7 +252,7 @@
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 18,
|
|
|
|
"execution_count": 25,
|
|
|
|
"metadata": {
|
|
|
|
"metadata": {
|
|
|
|
"scrolled": true
|
|
|
|
"scrolled": true
|
|
|
|
},
|
|
|
|
},
|
|
|
@ -261,8 +261,7 @@
|
|
|
|
"name": "stdout",
|
|
|
|
"name": "stdout",
|
|
|
|
"output_type": "stream",
|
|
|
|
"output_type": "stream",
|
|
|
|
"text": [
|
|
|
|
"text": [
|
|
|
|
"The autoreload extension is already loaded. To reload it, use:\n",
|
|
|
|
"env: CUDA_VISIBLE_DEVICES=0\n"
|
|
|
|
" %reload_ext autoreload\n"
|
|
|
|
|
|
|
|
]
|
|
|
|
]
|
|
|
|
}
|
|
|
|
}
|
|
|
|
],
|
|
|
|
],
|
|
|
@ -284,7 +283,7 @@
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 19,
|
|
|
|
"execution_count": 28,
|
|
|
|
"metadata": {
|
|
|
|
"metadata": {
|
|
|
|
"scrolled": true
|
|
|
|
"scrolled": true
|
|
|
|
},
|
|
|
|
},
|
|
|
@ -317,7 +316,7 @@
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 20,
|
|
|
|
"execution_count": 30,
|
|
|
|
"metadata": {
|
|
|
|
"metadata": {
|
|
|
|
"scrolled": true
|
|
|
|
"scrolled": true
|
|
|
|
},
|
|
|
|
},
|
|
|
@ -596,11 +595,19 @@
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 32,
|
|
|
|
"execution_count": 31,
|
|
|
|
"metadata": {
|
|
|
|
"metadata": {
|
|
|
|
"scrolled": true
|
|
|
|
"scrolled": true
|
|
|
|
},
|
|
|
|
},
|
|
|
|
"outputs": [],
|
|
|
|
"outputs": [
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
|
|
|
"text": [
|
|
|
|
|
|
|
|
"Frontend done!\n"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"source": [
|
|
|
|
"# 传入 phones_dict 会把相应的 phones 转换成 phone_ids\n",
|
|
|
|
"# 传入 phones_dict 会把相应的 phones 转换成 phone_ids\n",
|
|
|
|
"frontend = Frontend(phone_vocab_path=phones_dict)\n",
|
|
|
|
"frontend = Frontend(phone_vocab_path=phones_dict)\n",
|
|
|
@ -619,25 +626,11 @@
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 23,
|
|
|
|
"execution_count": 35,
|
|
|
|
"metadata": {
|
|
|
|
"metadata": {
|
|
|
|
"scrolled": true
|
|
|
|
"scrolled": true
|
|
|
|
},
|
|
|
|
},
|
|
|
|
"outputs": [
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
|
|
|
|
"name": "stderr",
|
|
|
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
|
|
|
"text": [
|
|
|
|
|
|
|
|
"Building prefix dict from the default dictionary ...\n",
|
|
|
|
|
|
|
|
"DEBUG:jieba:Building prefix dict from the default dictionary ...\n",
|
|
|
|
|
|
|
|
"Loading model from cache /tmp/jieba.cache\n",
|
|
|
|
|
|
|
|
"DEBUG:jieba:Loading model from cache /tmp/jieba.cache\n",
|
|
|
|
|
|
|
|
"Loading model cost 5.331 seconds.\n",
|
|
|
|
|
|
|
|
"DEBUG:jieba:Loading model cost 5.331 seconds.\n",
|
|
|
|
|
|
|
|
"Prefix dict has been built successfully.\n",
|
|
|
|
|
|
|
|
"DEBUG:jieba:Prefix dict has been built successfully.\n"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
{
|
|
|
|
"name": "stdout",
|
|
|
|
"name": "stdout",
|
|
|
|
"output_type": "stream",
|
|
|
|
"output_type": "stream",
|
|
|
@ -701,8 +694,10 @@
|
|
|
|
"<br></br>\n",
|
|
|
|
"<br></br>\n",
|
|
|
|
"在本教程中,我们使用 `FastSpeech2` 作为声学模型。\n",
|
|
|
|
"在本教程中,我们使用 `FastSpeech2` 作为声学模型。\n",
|
|
|
|
"![FastSpeech2](source/fastspeech2.png)\n",
|
|
|
|
"![FastSpeech2](source/fastspeech2.png)\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
"PaddleSpeech TTS 实现的 FastSpeech2 与论文不同的地方在于,我们使用的是 phone 级别的 `pitch` 和 `energy`(与 FastPitch 类似)。\n",
|
|
|
|
"PaddleSpeech TTS 实现的 FastSpeech2 与论文不同的地方在于,我们使用的是 phone 级别的 `pitch` 和 `energy`(与 FastPitch 类似)。\n",
|
|
|
|
"![FastPitch](source/fastpitch.png)\n",
|
|
|
|
"![FastPitch](source/fastpitch.png)\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
"更多关于[声学模型的发展及改进](https://paddlespeech.readthedocs.io/en/latest/tts/models_introduction.html)。"
|
|
|
|
"更多关于[声学模型的发展及改进](https://paddlespeech.readthedocs.io/en/latest/tts/models_introduction.html)。"
|
|
|
|
]
|
|
|
|
]
|
|
|
|
},
|
|
|
|
},
|
|
|
@ -1020,13 +1015,16 @@
|
|
|
|
"odim = fastspeech2_config.n_mels\n",
|
|
|
|
"odim = fastspeech2_config.n_mels\n",
|
|
|
|
"model = FastSpeech2(\n",
|
|
|
|
"model = FastSpeech2(\n",
|
|
|
|
" idim=vocab_size, odim=odim, **fastspeech2_config[\"model\"])\n",
|
|
|
|
" idim=vocab_size, odim=odim, **fastspeech2_config[\"model\"])\n",
|
|
|
|
"\n",
|
|
|
|
"# 加载预训练模型参数\n",
|
|
|
|
"model.set_state_dict(paddle.load(fastspeech2_checkpoint)[\"main_params\"]) # 加载预训练模型参数\n",
|
|
|
|
"model.set_state_dict(paddle.load(fastspeech2_checkpoint)[\"main_params\"])\n",
|
|
|
|
"model.eval() # 推理阶段不启用 batch norm 和 dropout\n",
|
|
|
|
"# 推理阶段不启用 batch norm 和 dropout\n",
|
|
|
|
|
|
|
|
"model.eval()\n",
|
|
|
|
"stat = np.load(fastspeech2_stat)\n",
|
|
|
|
"stat = np.load(fastspeech2_stat)\n",
|
|
|
|
"mu, std = stat # 读取数据预处理阶段数据集的均值和标准差\n",
|
|
|
|
"# 读取数据预处理阶段数据集的均值和标准差\n",
|
|
|
|
|
|
|
|
"mu, std = stat\n",
|
|
|
|
"mu, std = paddle.to_tensor(mu), paddle.to_tensor(std)\n",
|
|
|
|
"mu, std = paddle.to_tensor(mu), paddle.to_tensor(std)\n",
|
|
|
|
"fastspeech2_normalizer = ZScore(mu, std) # 构造归一化的新模型\n",
|
|
|
|
"# 构造归一化的新模型\n",
|
|
|
|
|
|
|
|
"fastspeech2_normalizer = ZScore(mu, std)\n",
|
|
|
|
"fastspeech2_inference = FastSpeech2Inference(fastspeech2_normalizer, model)\n",
|
|
|
|
"fastspeech2_inference = FastSpeech2Inference(fastspeech2_normalizer, model)\n",
|
|
|
|
"fastspeech2_inference.eval()\n",
|
|
|
|
"fastspeech2_inference.eval()\n",
|
|
|
|
"print(fastspeech2_inference)\n",
|
|
|
|
"print(fastspeech2_inference)\n",
|
|
|
@ -1153,16 +1151,18 @@
|
|
|
|
],
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"source": [
|
|
|
|
"vocoder = PWGGenerator(**pwg_config[\"generator_params\"])\n",
|
|
|
|
"vocoder = PWGGenerator(**pwg_config[\"generator_params\"])\n",
|
|
|
|
"\n",
|
|
|
|
"# 模型加载预训练参数\n",
|
|
|
|
"vocoder.set_state_dict(paddle.load(pwg_checkpoint)[\"generator_params\"]) # 模型加载预训练参数\n",
|
|
|
|
"vocoder.set_state_dict(paddle.load(pwg_checkpoint)[\"generator_params\"]) \n",
|
|
|
|
"vocoder.remove_weight_norm()\n",
|
|
|
|
"vocoder.remove_weight_norm()\n",
|
|
|
|
"vocoder.eval() # 推理阶段不启用 batch norm 和 dropout\n",
|
|
|
|
"# 推理阶段不启用 batch norm 和 dropout\n",
|
|
|
|
"\n",
|
|
|
|
"vocoder.eval()\n",
|
|
|
|
"stat = np.load(pwg_stat) # 读取数据预处理阶段数据集的均值和标准差\n",
|
|
|
|
"# 读取数据预处理阶段数据集的均值和标准差\n",
|
|
|
|
|
|
|
|
"stat = np.load(pwg_stat)\n",
|
|
|
|
"mu, std = stat\n",
|
|
|
|
"mu, std = stat\n",
|
|
|
|
"mu, std = paddle.to_tensor(mu), paddle.to_tensor(std)\n",
|
|
|
|
"mu, std = paddle.to_tensor(mu), paddle.to_tensor(std)\n",
|
|
|
|
"pwg_normalizer = ZScore(mu, std)\n",
|
|
|
|
"pwg_normalizer = ZScore(mu, std)\n",
|
|
|
|
"pwg_inference = PWGInference(pwg_normalizer, vocoder) # 构建归一化的模型\n",
|
|
|
|
"# 构建归一化的模型\n",
|
|
|
|
|
|
|
|
"pwg_inference = PWGInference(pwg_normalizer, vocoder)\n",
|
|
|
|
"pwg_inference.eval()\n",
|
|
|
|
"pwg_inference.eval()\n",
|
|
|
|
"print(\"Parallel WaveGAN done!\")"
|
|
|
|
"print(\"Parallel WaveGAN done!\")"
|
|
|
|
]
|
|
|
|
]
|
|
|
@ -1266,7 +1266,7 @@
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 40,
|
|
|
|
"execution_count": 36,
|
|
|
|
"metadata": {},
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
{
|
|
|
|