From 82e04d7815a8cf1935f2fec5cddc03bdb87c8484 Mon Sep 17 00:00:00 2001
From: tianhao zhang <15600919271@163.com>
Date: Thu, 8 Sep 2022 12:28:19 +0000
Subject: [PATCH 01/15] fix trainer
---
examples/aishell/asr1/run.sh | 4 ++--
paddlespeech/s2t/training/trainer.py | 5 ++++-
2 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/examples/aishell/asr1/run.sh b/examples/aishell/asr1/run.sh
index bd4f50e3f..701dcd2ac 100644
--- a/examples/aishell/asr1/run.sh
+++ b/examples/aishell/asr1/run.sh
@@ -2,8 +2,8 @@
source path.sh
set -e
-gpus=0,1,2,3
-stage=0
+gpus=1
+stage=1
stop_stage=50
conf_path=conf/conformer.yaml
ips= #xx.xx.xx.xx,xx.xx.xx.xx
diff --git a/paddlespeech/s2t/training/trainer.py b/paddlespeech/s2t/training/trainer.py
index a7eb9892d..d1bd30fef 100644
--- a/paddlespeech/s2t/training/trainer.py
+++ b/paddlespeech/s2t/training/trainer.py
@@ -19,6 +19,9 @@ from pathlib import Path
import paddle
from paddle import distributed as dist
+import pdb
+pdb.set_trace()
+dist.init_parallel_env()
from visualdl import LogWriter
from paddlespeech.s2t.training.reporter import ObsScope
@@ -176,7 +179,7 @@ class Trainer():
def init_parallel(self):
"""Init environment for multiprocess training.
"""
- dist.init_parallel_env()
+ # dist.init_parallel_env()
@mp_tools.rank_zero_only
def save(self, tag=None, infos: dict=None):
From 0975a332c4652301d659be7adc6184a9236e980f Mon Sep 17 00:00:00 2001
From: TianYuan
Date: Fri, 9 Sep 2022 15:53:14 +0800
Subject: [PATCH 02/15] Update README.md
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index f17cec13a..59c61f776 100644
--- a/README.md
+++ b/README.md
@@ -888,7 +888,7 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P
## Acknowledgement
-- Many thanks to [HighCWu](https://github.com/HighCWu)for adding [VITS-aishell3](./examples/aishell3/vits) and [VITS-VC](./examples/aishell3/vits-vc) examples.
+- Many thanks to [HighCWu](https://github.com/HighCWu) for adding [VITS-aishell3](./examples/aishell3/vits) and [VITS-VC](./examples/aishell3/vits-vc) examples.
- Many thanks to [david-95](https://github.com/david-95) for improving TTS, fixing a multi-punctuation bug, and contributing to multiple programs and data.
- Many thanks to [BarryKCL](https://github.com/BarryKCL) for improving the TTS Chinese frontend based on [G2PW](https://github.com/GitYCC/g2pW).
- Many thanks to [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) for years of attention, constructive advice and great help.
From 9560d650dbfc5df59af39aa33ea74a0b4081796f Mon Sep 17 00:00:00 2001
From: tianhao zhang <15600919271@163.com>
Date: Fri, 9 Sep 2022 08:30:10 +0000
Subject: [PATCH 03/15] fix dp init
---
paddlespeech/s2t/training/trainer.py | 11 +----------
1 file changed, 1 insertion(+), 10 deletions(-)
diff --git a/paddlespeech/s2t/training/trainer.py b/paddlespeech/s2t/training/trainer.py
index d1bd30fef..1093e4a12 100644
--- a/paddlespeech/s2t/training/trainer.py
+++ b/paddlespeech/s2t/training/trainer.py
@@ -19,9 +19,8 @@ from pathlib import Path
import paddle
from paddle import distributed as dist
-import pdb
-pdb.set_trace()
dist.init_parallel_env()
+
from visualdl import LogWriter
from paddlespeech.s2t.training.reporter import ObsScope
@@ -125,9 +124,6 @@ class Trainer():
else:
raise Exception("invalid device")
- if self.parallel:
- self.init_parallel()
-
self.checkpoint = Checkpoint(
kbest_n=self.config.checkpoint.kbest_n,
latest_n=self.config.checkpoint.latest_n)
@@ -176,11 +172,6 @@ class Trainer():
"""
return self.args.ngpu > 1
- def init_parallel(self):
- """Init environment for multiprocess training.
- """
- # dist.init_parallel_env()
-
@mp_tools.rank_zero_only
def save(self, tag=None, infos: dict=None):
"""Save checkpoint (model parameters and optimizer states).
From 989b755e8e9cfba8e8bb5fad7f672275980e1c1e Mon Sep 17 00:00:00 2001
From: WongLaw <95171490+WongLaw@users.noreply.github.com>
Date: Fri, 9 Sep 2022 16:55:58 +0800
Subject: [PATCH 04/15] Revised must_neural_tone_words, test=doc. (#2370)
* Revised must_neural_tone_words.
---
paddlespeech/t2s/exps/vits/__init__.py | 2 +-
paddlespeech/t2s/frontend/tone_sandhi.py | 5 ++---
2 files changed, 3 insertions(+), 4 deletions(-)
diff --git a/paddlespeech/t2s/exps/vits/__init__.py b/paddlespeech/t2s/exps/vits/__init__.py
index abf198b97..97043fd7b 100644
--- a/paddlespeech/t2s/exps/vits/__init__.py
+++ b/paddlespeech/t2s/exps/vits/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/paddlespeech/t2s/frontend/tone_sandhi.py b/paddlespeech/t2s/frontend/tone_sandhi.py
index 9fff4272c..10a9540c3 100644
--- a/paddlespeech/t2s/frontend/tone_sandhi.py
+++ b/paddlespeech/t2s/frontend/tone_sandhi.py
@@ -30,7 +30,7 @@ class ToneSandhi():
'蛤蟆', '蘑菇', '薄荷', '葫芦', '葡萄', '萝卜', '荸荠', '苗条', '苗头', '苍蝇', '芝麻',
'舒服', '舒坦', '舌头', '自在', '膏药', '脾气', '脑袋', '脊梁', '能耐', '胳膊', '胭脂',
'胡萝', '胡琴', '胡同', '聪明', '耽误', '耽搁', '耷拉', '耳朵', '老爷', '老实', '老婆',
- '老头', '老太', '翻腾', '罗嗦', '罐头', '编辑', '结实', '红火', '累赘', '糨糊', '糊涂',
+ '戏弄', '将军', '翻腾', '罗嗦', '罐头', '编辑', '结实', '红火', '累赘', '糨糊', '糊涂',
'精神', '粮食', '簸箕', '篱笆', '算计', '算盘', '答应', '笤帚', '笑语', '笑话', '窟窿',
'窝囊', '窗户', '稳当', '稀罕', '称呼', '秧歌', '秀气', '秀才', '福气', '祖宗', '砚台',
'码头', '石榴', '石头', '石匠', '知识', '眼睛', '眯缝', '眨巴', '眉毛', '相声', '盘算',
@@ -59,8 +59,7 @@ class ToneSandhi():
'下巴', '上头', '上司', '丈夫', '丈人', '一辈', '那个', '菩萨', '父亲', '母亲', '咕噜',
'邋遢', '费用', '冤家', '甜头', '介绍', '荒唐', '大人', '泥鳅', '幸福', '熟悉', '计划',
'扑腾', '蜡烛', '姥爷', '照顾', '喉咙', '吉他', '弄堂', '蚂蚱', '凤凰', '拖沓', '寒碜',
- '糟蹋', '倒腾', '报复', '逻辑', '盘缠', '喽啰', '牢骚', '咖喱', '扫把', '惦记', '戏弄',
- '将军'
+ '糟蹋', '倒腾', '报复', '逻辑', '盘缠', '喽啰', '牢骚', '咖喱', '扫把', '惦记'
}
self.must_not_neural_tone_words = {
'男子', '女子', '分子', '原子', '量子', '莲子', '石子', '瓜子', '电子', '人人', '虎虎',
From 6745e9dd6b176123bd9dea80576945bad7f0a0ea Mon Sep 17 00:00:00 2001
From: tianhao zhang <15600919271@163.com>
Date: Fri, 9 Sep 2022 09:23:56 +0000
Subject: [PATCH 05/15] fix dp init
---
paddlespeech/s2t/models/u2/u2.py | 4 +--
paddlespeech/s2t/modules/attention.py | 36 +++++++++----------
.../s2t/modules/conformer_convolution.py | 4 +--
paddlespeech/s2t/modules/decoder_layer.py | 17 +++------
paddlespeech/s2t/modules/encoder.py | 14 +++-----
paddlespeech/s2t/modules/encoder_layer.py | 16 ++++-----
.../engine/asr/online/python/asr_engine.py | 3 +-
7 files changed, 38 insertions(+), 56 deletions(-)
diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py
index 813e1e529..8a9849492 100644
--- a/paddlespeech/s2t/models/u2/u2.py
+++ b/paddlespeech/s2t/models/u2/u2.py
@@ -605,8 +605,8 @@ class U2BaseModel(ASRInterface, nn.Layer):
xs: paddle.Tensor,
offset: int,
required_cache_size: int,
- att_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0])
- cnn_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0])
+ att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]),
+ cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0])
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
""" Export interface for c++ call, give input chunk xs, and return
output from time 0 to current chunk.
diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py
index 92990048d..2d236743a 100644
--- a/paddlespeech/s2t/modules/attention.py
+++ b/paddlespeech/s2t/modules/attention.py
@@ -86,7 +86,7 @@ class MultiHeadedAttention(nn.Layer):
self,
value: paddle.Tensor,
scores: paddle.Tensor,
- mask: paddle.Tensor, # paddle.ones([0, 0, 0], dtype=paddle.bool)
+ mask: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool)
) -> paddle.Tensor:
"""Compute attention context vector.
Args:
@@ -127,15 +127,14 @@ class MultiHeadedAttention(nn.Layer):
return self.linear_out(x) # (batch, time1, d_model)
- def forward(
- self,
- query: paddle.Tensor,
- key: paddle.Tensor,
- value: paddle.Tensor,
- mask: paddle.Tensor, # paddle.ones([0,0,0], dtype=paddle.bool)
- pos_emb: paddle.Tensor, # paddle.empty([0])
- cache: paddle.Tensor # paddle.zeros([0,0,0,0])
- ) -> Tuple[paddle.Tensor, paddle.Tensor]:
+ def forward(self,
+ query: paddle.Tensor,
+ key: paddle.Tensor,
+ value: paddle.Tensor,
+ mask: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool),
+ pos_emb: paddle.Tensor=paddle.empty([0]),
+ cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0])
+ ) -> Tuple[paddle.Tensor, paddle.Tensor]:
"""Compute scaled dot product attention.
Args:
query (paddle.Tensor): Query tensor (#batch, time1, size).
@@ -244,15 +243,14 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
return x
- def forward(
- self,
- query: paddle.Tensor,
- key: paddle.Tensor,
- value: paddle.Tensor,
- mask: paddle.Tensor, # paddle.ones([0,0,0], dtype=paddle.bool)
- pos_emb: paddle.Tensor, # paddle.empty([0])
- cache: paddle.Tensor # paddle.zeros([0,0,0,0])
- ) -> Tuple[paddle.Tensor, paddle.Tensor]:
+ def forward(self,
+ query: paddle.Tensor,
+ key: paddle.Tensor,
+ value: paddle.Tensor,
+ mask: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool),
+ pos_emb: paddle.Tensor=paddle.empty([0]),
+ cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0])
+ ) -> Tuple[paddle.Tensor, paddle.Tensor]:
"""Compute 'Scaled Dot Product Attention' with rel. positional encoding.
Args:
query (paddle.Tensor): Query tensor (#batch, time1, size).
diff --git a/paddlespeech/s2t/modules/conformer_convolution.py b/paddlespeech/s2t/modules/conformer_convolution.py
index b35fea5b9..be6056546 100644
--- a/paddlespeech/s2t/modules/conformer_convolution.py
+++ b/paddlespeech/s2t/modules/conformer_convolution.py
@@ -108,8 +108,8 @@ class ConvolutionModule(nn.Layer):
def forward(
self,
x: paddle.Tensor,
- mask_pad: paddle.Tensor, # paddle.ones([0,0,0], dtype=paddle.bool)
- cache: paddle.Tensor # paddle.zeros([0,0,0,0])
+ mask_pad: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool),
+ cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0])
) -> Tuple[paddle.Tensor, paddle.Tensor]:
"""Compute convolution module.
Args:
diff --git a/paddlespeech/s2t/modules/decoder_layer.py b/paddlespeech/s2t/modules/decoder_layer.py
index c8843b723..37b124e84 100644
--- a/paddlespeech/s2t/modules/decoder_layer.py
+++ b/paddlespeech/s2t/modules/decoder_layer.py
@@ -121,16 +121,11 @@ class DecoderLayer(nn.Layer):
if self.concat_after:
tgt_concat = paddle.cat(
- (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask,
- paddle.empty([0]),
- paddle.zeros([0, 0, 0, 0]))[0]),
- dim=-1)
+ (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]), dim=-1)
x = residual + self.concat_linear1(tgt_concat)
else:
x = residual + self.dropout(
- self.self_attn(tgt_q, tgt, tgt, tgt_q_mask,
- paddle.empty([0]), paddle.zeros([0, 0, 0, 0]))[
- 0])
+ self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0])
if not self.normalize_before:
x = self.norm1(x)
@@ -139,15 +134,11 @@ class DecoderLayer(nn.Layer):
x = self.norm2(x)
if self.concat_after:
x_concat = paddle.cat(
- (x, self.src_attn(x, memory, memory, memory_mask,
- paddle.empty([0]),
- paddle.zeros([0, 0, 0, 0]))[0]),
- dim=-1)
+ (x, self.src_attn(x, memory, memory, memory_mask)[0]), dim=-1)
x = residual + self.concat_linear2(x_concat)
else:
x = residual + self.dropout(
- self.src_attn(x, memory, memory, memory_mask,
- paddle.empty([0]), paddle.zeros([0, 0, 0, 0]))[0])
+ self.src_attn(x, memory, memory, memory_mask)[0])
if not self.normalize_before:
x = self.norm2(x)
diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py
index cf4e32fa4..2f4ad1b29 100644
--- a/paddlespeech/s2t/modules/encoder.py
+++ b/paddlespeech/s2t/modules/encoder.py
@@ -175,9 +175,7 @@ class BaseEncoder(nn.Layer):
decoding_chunk_size, self.static_chunk_size,
num_decoding_left_chunks)
for layer in self.encoders:
- xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad,
- paddle.zeros([0, 0, 0, 0]),
- paddle.zeros([0, 0, 0, 0]))
+ xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
if self.normalize_before:
xs = self.after_norm(xs)
# Here we assume the mask is not changed in encoder layers, so just
@@ -190,9 +188,9 @@ class BaseEncoder(nn.Layer):
xs: paddle.Tensor,
offset: int,
required_cache_size: int,
- att_cache: paddle.Tensor, # paddle.zeros([0,0,0,0])
- cnn_cache: paddle.Tensor, # paddle.zeros([0,0,0,0]),
- att_mask: paddle.Tensor, # paddle.ones([0,0,0], dtype=paddle.bool)
+ att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]),
+ cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]),
+ att_mask: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool)
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
""" Forward just one chunk
Args:
@@ -255,7 +253,6 @@ class BaseEncoder(nn.Layer):
xs,
att_mask,
pos_emb,
- mask_pad=paddle.ones([0, 0, 0], dtype=paddle.bool),
att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache,
cnn_cache=cnn_cache[i:i + 1]
if paddle.shape(cnn_cache)[0] > 0 else cnn_cache, )
@@ -328,8 +325,7 @@ class BaseEncoder(nn.Layer):
chunk_xs = xs[:, cur:end, :]
(y, att_cache, cnn_cache) = self.forward_chunk(
- chunk_xs, offset, required_cache_size, att_cache, cnn_cache,
- paddle.ones([0, 0, 0], dtype=paddle.bool))
+ chunk_xs, offset, required_cache_size, att_cache, cnn_cache)
outputs.append(y)
offset += y.shape[1]
diff --git a/paddlespeech/s2t/modules/encoder_layer.py b/paddlespeech/s2t/modules/encoder_layer.py
index 4555b535f..dac62bce3 100644
--- a/paddlespeech/s2t/modules/encoder_layer.py
+++ b/paddlespeech/s2t/modules/encoder_layer.py
@@ -76,10 +76,9 @@ class TransformerEncoderLayer(nn.Layer):
x: paddle.Tensor,
mask: paddle.Tensor,
pos_emb: paddle.Tensor,
- mask_pad: paddle.
- Tensor, # paddle.ones([0, 0, 0], dtype=paddle.bool)
- att_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0])
- cnn_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0])
+ mask_pad: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool),
+ att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]),
+ cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0])
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]:
"""Compute encoded features.
Args:
@@ -106,8 +105,7 @@ class TransformerEncoderLayer(nn.Layer):
if self.normalize_before:
x = self.norm1(x)
- x_att, new_att_cache = self.self_attn(
- x, x, x, mask, paddle.empty([0]), cache=att_cache)
+ x_att, new_att_cache = self.self_attn(x, x, x, mask, cache=att_cache)
if self.concat_after:
x_concat = paddle.concat((x, x_att), axis=-1)
@@ -195,9 +193,9 @@ class ConformerEncoderLayer(nn.Layer):
x: paddle.Tensor,
mask: paddle.Tensor,
pos_emb: paddle.Tensor,
- mask_pad: paddle.Tensor, #paddle.ones([0, 0, 0],dtype=paddle.bool)
- att_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0])
- cnn_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0])
+ mask_pad: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool),
+ att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]),
+ cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0])
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]:
"""Compute encoded features.
Args:
diff --git a/paddlespeech/server/engine/asr/online/python/asr_engine.py b/paddlespeech/server/engine/asr/online/python/asr_engine.py
index 87d88ee60..5782d7035 100644
--- a/paddlespeech/server/engine/asr/online/python/asr_engine.py
+++ b/paddlespeech/server/engine/asr/online/python/asr_engine.py
@@ -480,8 +480,7 @@ class PaddleASRConnectionHanddler:
self.offset,
required_cache_size,
att_cache=self.att_cache,
- cnn_cache=self.cnn_cache,
- att_mask=paddle.ones([0, 0, 0], dtype=paddle.bool))
+ cnn_cache=self.cnn_cache)
outputs.append(y)
# update the global offset, in decoding frame unit
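Note: patch 05 above turns the commented-out cache hints (e.g. `# paddle.zeros([0, 0, 0, 0])`) into real default arguments. In Python a default value is evaluated once, at definition time, and shared by every call that omits it; that is safe here because the empty cache sentinels are only read or rebound, never mutated in place. A minimal, paddle-free sketch of those semantics:

```python
# Paddle-free sketch: default values are created once and shared across calls.
def safe(cache=[]):
    # the shared default is only read, so sharing is harmless
    return len(cache)

def unsafe(cache=[]):
    # the shared default is mutated in place, so state leaks between calls
    cache.append(1)
    return len(cache)

print(safe(), safe())      # 0 0
print(unsafe(), unsafe())  # 1 2
```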
From 08b9c45811110b3656bc6ad9844199055a9063d0 Mon Sep 17 00:00:00 2001
From: tianhao zhang <15600919271@163.com>
Date: Fri, 9 Sep 2022 09:26:41 +0000
Subject: [PATCH 06/15] fix dp init
---
examples/aishell/asr1/run.sh | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/examples/aishell/asr1/run.sh b/examples/aishell/asr1/run.sh
index 701dcd2ac..bd4f50e3f 100644
--- a/examples/aishell/asr1/run.sh
+++ b/examples/aishell/asr1/run.sh
@@ -2,8 +2,8 @@
source path.sh
set -e
-gpus=1
-stage=1
+gpus=0,1,2,3
+stage=0
stop_stage=50
conf_path=conf/conformer.yaml
ips= #xx.xx.xx.xx,xx.xx.xx.xx
From fdcc8c042762da5141fc5c59f43b37dfa28cfab7 Mon Sep 17 00:00:00 2001
From: TianYuan
Date: Fri, 9 Sep 2022 17:49:20 +0800
Subject: [PATCH 07/15] Update README.md
---
examples/aishell3_vctk/ernie_sat/README.md | 2 ++
1 file changed, 2 insertions(+)
diff --git a/examples/aishell3_vctk/ernie_sat/README.md b/examples/aishell3_vctk/ernie_sat/README.md
index 777bea326..a849488d5 100644
--- a/examples/aishell3_vctk/ernie_sat/README.md
+++ b/examples/aishell3_vctk/ernie_sat/README.md
@@ -29,9 +29,11 @@ Or train your MFA model reference to [mfa example](https://github.com/PaddlePadd
Assume the paths to the datasets are:
- `~/datasets/data_aishell3`
- `~/datasets/VCTK-Corpus-0.92`
+
Assume the path to the MFA results of the datasets are:
- `./aishell3_alignment_tone`
- `./vctk_alignment`
+
Run the command below to
1. **source path**.
2. preprocess the dataset.
From 663e3ab58ee21d24b2f6d28f5d1050fba84be088 Mon Sep 17 00:00:00 2001
From: tianhao zhang <15600919271@163.com>
Date: Fri, 9 Sep 2022 09:52:49 +0000
Subject: [PATCH 08/15] fix dp init
---
paddlespeech/s2t/training/trainer.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/paddlespeech/s2t/training/trainer.py b/paddlespeech/s2t/training/trainer.py
index 1093e4a12..4a69d78a4 100644
--- a/paddlespeech/s2t/training/trainer.py
+++ b/paddlespeech/s2t/training/trainer.py
@@ -19,7 +19,9 @@ from pathlib import Path
import paddle
from paddle import distributed as dist
-dist.init_parallel_env()
+world_size = dist.get_world_size()
+if world_size > 1:
+ dist.init_parallel_env()
from visualdl import LogWriter
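Note: this is the form the dp-init fix converges on after the experiments in patches 01 and 03: `init_parallel_env()` moves to import time but is only called when more than one trainer process exists. A minimal sketch of the guard, assuming only the public `paddle.distributed` API:

```python
from paddle import distributed as dist

# init_parallel_env() is only needed (and only valid) for multi-process
# training; get_world_size() returns 1 for a plain single-process run,
# so this guard keeps single-GPU/CPU imports working unchanged.
if dist.get_world_size() > 1:
    dist.init_parallel_env()
```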
From 08c3ceb04bde74735b204090600fc8bc2106a70b Mon Sep 17 00:00:00 2001
From: tianhao zhang <15600919271@163.com>
Date: Fri, 9 Sep 2022 15:40:37 +0000
Subject: [PATCH 09/15] update wenetspeech streaming conformer result
---
examples/wenetspeech/asr1/RESULTS.md | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/examples/wenetspeech/asr1/RESULTS.md b/examples/wenetspeech/asr1/RESULTS.md
index cc209db75..af84a5f6e 100644
--- a/examples/wenetspeech/asr1/RESULTS.md
+++ b/examples/wenetspeech/asr1/RESULTS.md
@@ -34,3 +34,15 @@ Pretrain model from http://mobvoi-speech-public.ufile.ucloud.cn/public/wenet/wen
| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | - | 0.052534 |
| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | - | 0.052915 |
| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention_rescoring | - | 0.047904 |
+
+
+## Conformer Streaming Pretrained Model
+
+Pretrain model from https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz
+
+| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size | CER |
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention | 16 | 0.056273 |
+| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | 16 | 0.078918 |
+| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | 16 | 0.079080 |
+| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention_rescoring | 16 | 0.054401 |
From 445cb2b08c49632e08f9f847b0b062d32a507efa Mon Sep 17 00:00:00 2001
From: yuehuayingxueluo <867460659@qq.com>
Date: Tue, 13 Sep 2022 15:05:33 +0800
Subject: [PATCH 10/15] fix prepare.sh (#2376)
Co-authored-by: yuehuayingxueluo
---
tests/test_tipc/prepare.sh | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
mode change 100644 => 100755 tests/test_tipc/prepare.sh
diff --git a/tests/test_tipc/prepare.sh b/tests/test_tipc/prepare.sh
old mode 100644
new mode 100755
index 2a2272813..cb05a1d0f
--- a/tests/test_tipc/prepare.sh
+++ b/tests/test_tipc/prepare.sh
@@ -15,6 +15,7 @@ dataline=$(cat ${FILENAME})
# parser params
IFS=$'\n'
lines=(${dataline})
+python=python
# The training params
model_name=$(func_parser_value "${lines[1]}")
@@ -68,7 +69,7 @@ if [[ ${MODE} = "benchmark_train" ]];then
if [[ ${model_name} == "pwgan" ]]; then
# 下载 csmsc 数据集并解压缩
- wget -nc https://weixinxcxdb.oss-cn-beijing.aliyuncs.com/gwYinPinKu/BZNSYP.rar
+ wget -nc https://paddle-wheel.bj.bcebos.com/benchmark/BZNSYP.rar
mkdir -p BZNSYP
unrar x BZNSYP.rar BZNSYP
wget -nc https://paddlespeech.bj.bcebos.com/Parakeet/benchmark/durations.txt
@@ -80,6 +81,10 @@ if [[ ${MODE} = "benchmark_train" ]];then
python ../paddlespeech/t2s/exps/gan_vocoder/normalize.py --metadata=dump/test/raw/metadata.jsonl --dumpdir=dump/test/norm --stats=dump/train/feats_stats.npy
fi
+ echo "barrier start"
+ PYTHON="${python}" bash test_tipc/barrier.sh
+ echo "barrier end"
+
if [[ ${model_name} == "mdtc" ]]; then
# 下载 Snips 数据集并解压缩
wget https://paddlespeech.bj.bcebos.com/datasets/hey_snips_kws_4.0.tar.gz.1
From ec571bb0d113d5ab01324b0120438b8c1824f56b Mon Sep 17 00:00:00 2001
From: TianYuan
Date: Tue, 13 Sep 2022 16:56:03 +0800
Subject: [PATCH 11/15] Update README.md
---
examples/voxceleb/sv0/README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/examples/voxceleb/sv0/README.md b/examples/voxceleb/sv0/README.md
index 26c95aca9..7fe759ebc 100644
--- a/examples/voxceleb/sv0/README.md
+++ b/examples/voxceleb/sv0/README.md
@@ -148,4 +148,4 @@ source path.sh
CUDA_VISIBLE_DEVICES= bash ./local/test.sh ./data sv0_ecapa_tdnn_voxceleb12_ckpt_0_2_1/model/ conf/ecapa_tdnn.yaml
```
-The performance of the released models are shown in [this](./RESULTS.md)
+The performance of the released models are shown in [this](./RESULT.md)
From 80b180217df310b8738c06577c88965bab38f160 Mon Sep 17 00:00:00 2001
From: TianYuan
Date: Wed, 14 Sep 2022 10:37:03 +0800
Subject: [PATCH 12/15] [TTS] fix some bugs of ERNIE-SAT (#2378)
* fix ernie_sat, test=tts
* fix for comments, test=tts
---
.../ernie_sat/local/synthesize_e2e.sh | 6 ++--
.../ernie_sat/local/synthesize_e2e.sh | 6 ++--
.../vctk/ernie_sat/local/synthesize_e2e.sh | 6 ++--
paddlespeech/t2s/exps/ernie_sat/align.py | 4 +--
.../t2s/exps/ernie_sat/synthesize_e2e.py | 28 +++++++++++--------
5 files changed, 27 insertions(+), 23 deletions(-)
diff --git a/examples/aishell3/ernie_sat/local/synthesize_e2e.sh b/examples/aishell3/ernie_sat/local/synthesize_e2e.sh
index b33e8ca09..77b353b52 100755
--- a/examples/aishell3/ernie_sat/local/synthesize_e2e.sh
+++ b/examples/aishell3/ernie_sat/local/synthesize_e2e.sh
@@ -13,9 +13,9 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/synthesize_e2e.py \
--task_name=synthesize \
- --wav_path=source/SSB03540307.wav\
- --old_str='请播放歌曲小苹果。' \
- --new_str='歌曲真好听。' \
+ --wav_path=source/SSB03540307.wav \
+ --old_str='请播放歌曲小苹果' \
+ --new_str='歌曲真好听' \
--source_lang=zh \
--target_lang=zh \
--erniesat_config=${config_path} \
diff --git a/examples/aishell3_vctk/ernie_sat/local/synthesize_e2e.sh b/examples/aishell3_vctk/ernie_sat/local/synthesize_e2e.sh
index c30af6e85..446ac8791 100755
--- a/examples/aishell3_vctk/ernie_sat/local/synthesize_e2e.sh
+++ b/examples/aishell3_vctk/ernie_sat/local/synthesize_e2e.sh
@@ -15,7 +15,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/synthesize_e2e.py \
--task_name=synthesize \
--wav_path=source/p243_313.wav \
- --old_str='For that reason cover should not be given.' \
+ --old_str='For that reason cover should not be given' \
--new_str='今天天气很好' \
--source_lang=en \
--target_lang=zh \
@@ -36,8 +36,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${BIN_DIR}/synthesize_e2e.py \
--task_name=synthesize \
--wav_path=source/SSB03540307.wav \
- --old_str='请播放歌曲小苹果。' \
- --new_str="Thank you!" \
+ --old_str='请播放歌曲小苹果' \
+ --new_str="Thank you" \
--source_lang=zh \
--target_lang=en \
--erniesat_config=${config_path} \
diff --git a/examples/vctk/ernie_sat/local/synthesize_e2e.sh b/examples/vctk/ernie_sat/local/synthesize_e2e.sh
index fee540169..dcc710447 100755
--- a/examples/vctk/ernie_sat/local/synthesize_e2e.sh
+++ b/examples/vctk/ernie_sat/local/synthesize_e2e.sh
@@ -14,7 +14,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/synthesize_e2e.py \
--task_name=synthesize \
--wav_path=source/p243_313.wav \
- --old_str='For that reason cover should not be given.' \
+ --old_str='For that reason cover should not be given' \
--new_str='I love you very much do you love me' \
--source_lang=en \
--target_lang=en \
@@ -36,8 +36,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${BIN_DIR}/synthesize_e2e.py \
--task_name=edit \
--wav_path=source/p243_313.wav \
- --old_str='For that reason cover should not be given.' \
- --new_str='For that reason cover is not impossible to be given.' \
+ --old_str='For that reason cover should not be given' \
+ --new_str='For that reason cover is not impossible to be given' \
--source_lang=en \
--target_lang=en \
--erniesat_config=${config_path} \
diff --git a/paddlespeech/t2s/exps/ernie_sat/align.py b/paddlespeech/t2s/exps/ernie_sat/align.py
index 464f51a3b..8dbe685f5 100755
--- a/paddlespeech/t2s/exps/ernie_sat/align.py
+++ b/paddlespeech/t2s/exps/ernie_sat/align.py
@@ -58,7 +58,7 @@ def _readtg(tg_path: str, lang: str='en', fs: int=24000, n_shift: int=300):
durations[-2] += durations[-1]
durations = durations[:-1]
- # replace ' and 'sil' with 'sp'
+ # replace '' and 'sil' with 'sp'
phones = ['sp' if (phn == '' or phn == 'sil') else phn for phn in phones]
if lang == 'en':
@@ -195,7 +195,7 @@ def words2phns(text: str, lang='en'):
wrd = wrd.upper()
if (wrd not in ds):
wrd2phns[str(index) + '_' + wrd] = 'spn'
- phns.extend('spn')
+ phns.extend(['spn'])
else:
wrd2phns[str(index) + '_' + wrd] = word2phns_dict[wrd].split()
phns.extend(word2phns_dict[wrd].split())
diff --git a/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py b/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py
index 21c9ae044..e450aa1a0 100644
--- a/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py
@@ -137,9 +137,6 @@ def prep_feats_with_dur(wav_path: str,
new_wav = np.concatenate(
[wav_org[:wav_left_idx], blank_wav, wav_org[wav_right_idx:]])
- # 音频是正常遮住了
- sf.write(str("mask_wav.wav"), new_wav, samplerate=fs)
-
# 4. get old and new mel span to be mask
old_span_bdy = get_span_bdy(
mfa_start=mfa_start, mfa_end=mfa_end, span_to_repl=span_to_repl)
@@ -274,7 +271,8 @@ def get_wav(wav_path: str,
new_str: str='',
duration_adjust: bool=True,
fs: int=24000,
- n_shift: int=300):
+ n_shift: int=300,
+ task_name: str='synthesize'):
outs = get_mlm_output(
wav_path=wav_path,
@@ -298,9 +296,11 @@ def get_wav(wav_path: str,
alt_wav = np.squeeze(alt_wav)
old_time_bdy = [n_shift * x for x in old_span_bdy]
- wav_replaced = np.concatenate(
- [wav_org[:old_time_bdy[0]], alt_wav, wav_org[old_time_bdy[1]:]])
-
+ if task_name == 'edit':
+ wav_replaced = np.concatenate(
+ [wav_org[:old_time_bdy[0]], alt_wav, wav_org[old_time_bdy[1]:]])
+ else:
+ wav_replaced = alt_wav
wav_dict = {"origin": wav_org, "output": wav_replaced}
return wav_dict
@@ -356,7 +356,11 @@ def parse_args():
"--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
# ernie sat related
- parser.add_argument("--task_name", type=str, help="task name")
+ parser.add_argument(
+ "--task_name",
+ type=str,
+ choices=['edit', 'synthesize'],
+ help="task name.")
parser.add_argument("--wav_path", type=str, help="path of old wav")
parser.add_argument("--old_str", type=str, help="old string")
parser.add_argument("--new_str", type=str, help="new string")
@@ -410,10 +414,9 @@ if __name__ == '__main__':
if args.task_name == 'edit':
new_str = new_str
elif args.task_name == 'synthesize':
- new_str = old_str + new_str
+ new_str = old_str + ' ' + new_str
else:
- new_str = old_str + new_str
- print("new_str:", new_str)
+ new_str = old_str + ' ' + new_str
# Extractor
mel_extractor = LogMelFBank(
@@ -467,7 +470,8 @@ if __name__ == '__main__':
new_str=new_str,
duration_adjust=args.duration_adjust,
fs=erniesat_config.fs,
- n_shift=erniesat_config.n_shift)
+ n_shift=erniesat_config.n_shift,
+ task_name=args.task_name)
sf.write(
args.output_name, wav_dict['output'], samplerate=erniesat_config.fs)
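Note: the `task_name` branch added to `get_wav()` above distinguishes editing (splice the regenerated span back into the original waveform) from synthesis (return the generated audio as-is). A hypothetical numpy-only sketch of that assembly step; `assemble` is an illustrative name, not a function from the patch:

```python
import numpy as np

def assemble(wav_org, alt_wav, old_time_bdy, task_name='synthesize'):
    if task_name == 'edit':
        # keep the untouched head and tail, replace only the edited span
        return np.concatenate(
            [wav_org[:old_time_bdy[0]], alt_wav, wav_org[old_time_bdy[1]:]])
    return alt_wav  # synthesize: the whole output is newly generated

out = assemble(np.zeros(48000), np.ones(12000), [24000, 30000], 'edit')
print(out.shape)  # (54000,)
```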
From 02679906e649bf123c15e988bf84facd885aa7ee Mon Sep 17 00:00:00 2001
From: TianYuan
Date: Wed, 14 Sep 2022 15:22:25 +0800
Subject: [PATCH 13/15] Update tts_papers.md
---
docs/source/tts/tts_papers.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/docs/source/tts/tts_papers.md b/docs/source/tts/tts_papers.md
index 681b21066..f3ca1b624 100644
--- a/docs/source/tts/tts_papers.md
+++ b/docs/source/tts/tts_papers.md
@@ -5,6 +5,7 @@
- [Disambiguation of Chinese Polyphones in an End-to-End Framework with Semantic Features Extracted by Pre-trained BERT](https://www1.se.cuhk.edu.hk/~hccl/publications/pub/201909_INTERSPEECH_DongyangDAI.pdf)
- [Polyphone Disambiguation in Mandarin Chinese with Semi-Supervised Learning](https://www.isca-speech.org/archive/pdfs/interspeech_2021/shi21d_interspeech.pdf)
* github: https://github.com/PaperMechanica/SemiPPL
+- [WikipediaHomographData](https://github.com/google-research-datasets/WikipediaHomographData)
### Text Normalization
#### English
- [applenob/text_normalization](https://github.com/applenob/text_normalization)
From 324b166c5293323082e2c326d728618fd05fcac0 Mon Sep 17 00:00:00 2001
From: WongLaw <95171490+WongLaw@users.noreply.github.com>
Date: Wed, 14 Sep 2022 16:11:12 +0800
Subject: [PATCH 14/15] Removed useless spk_id in speech_server and
streaming_tts_server from demos, and support bilingual server engine,
test=tts (#2380)
* Removed useless spk_id in speech_server and streaming_tts_server from demos, and support bilingual server engine.
---
demos/speech_server/conf/application.yaml | 4 ++--
demos/streaming_tts_server/conf/tts_online_application.yaml | 3 +--
.../streaming_tts_server/conf/tts_online_ws_application.yaml | 3 +--
paddlespeech/server/engine/engine_warmup.py | 4 +++-
4 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/demos/speech_server/conf/application.yaml b/demos/speech_server/conf/application.yaml
index 9c171c470..b5ee80095 100644
--- a/demos/speech_server/conf/application.yaml
+++ b/demos/speech_server/conf/application.yaml
@@ -61,7 +61,7 @@ tts_python:
phones_dict:
tones_dict:
speaker_dict:
- spk_id: 0
+
# voc (vocoder) choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3',
# 'pwgan_vctk', 'mb_melgan_csmsc', 'style_melgan_csmsc',
@@ -87,7 +87,7 @@ tts_inference:
phones_dict:
tones_dict:
speaker_dict:
- spk_id: 0
+
am_predictor_conf:
device: # set 'gpu:id' or 'cpu'
diff --git a/demos/streaming_tts_server/conf/tts_online_application.yaml b/demos/streaming_tts_server/conf/tts_online_application.yaml
index e617912fe..f5ec9dc8e 100644
--- a/demos/streaming_tts_server/conf/tts_online_application.yaml
+++ b/demos/streaming_tts_server/conf/tts_online_application.yaml
@@ -29,7 +29,7 @@ tts_online:
phones_dict:
tones_dict:
speaker_dict:
- spk_id: 0
+
# voc (vocoder) choices=['mb_melgan_csmsc, hifigan_csmsc']
# Both mb_melgan_csmsc and hifigan_csmsc support streaming voc inference
@@ -70,7 +70,6 @@ tts_online-onnx:
phones_dict:
tones_dict:
speaker_dict:
- spk_id: 0
am_sample_rate: 24000
am_sess_conf:
device: "cpu" # set 'gpu:id' or 'cpu'
diff --git a/demos/streaming_tts_server/conf/tts_online_ws_application.yaml b/demos/streaming_tts_server/conf/tts_online_ws_application.yaml
index 329f882cc..c65633917 100644
--- a/demos/streaming_tts_server/conf/tts_online_ws_application.yaml
+++ b/demos/streaming_tts_server/conf/tts_online_ws_application.yaml
@@ -29,7 +29,7 @@ tts_online:
phones_dict:
tones_dict:
speaker_dict:
- spk_id: 0
+
# voc (vocoder) choices=['mb_melgan_csmsc, hifigan_csmsc']
# Both mb_melgan_csmsc and hifigan_csmsc support streaming voc inference
@@ -70,7 +70,6 @@ tts_online-onnx:
phones_dict:
tones_dict:
speaker_dict:
- spk_id: 0
am_sample_rate: 24000
am_sess_conf:
device: "cpu" # set 'gpu:id' or 'cpu'
diff --git a/paddlespeech/server/engine/engine_warmup.py b/paddlespeech/server/engine/engine_warmup.py
index 3751554c2..ff65dff97 100644
--- a/paddlespeech/server/engine/engine_warmup.py
+++ b/paddlespeech/server/engine/engine_warmup.py
@@ -27,8 +27,10 @@ def warm_up(engine_and_type: str, warm_up_time: int=3) -> bool:
sentence = "您好,欢迎使用语音合成服务。"
elif tts_engine.lang == 'en':
sentence = "Hello and welcome to the speech synthesis service."
+ elif tts_engine.lang == 'mix':
+ sentence = "您好,欢迎使用TTS多语种服务。"
else:
- logger.error("tts engine only support lang: zh or en.")
+ logger.error("tts engine only support lang: zh or en or mix.")
sys.exit(-1)
if engine_and_type == "tts_python":
From cdf095595f0398ac0fb20d9cd6f80672c5c00d0c Mon Sep 17 00:00:00 2001
From: liangym <34430015+lym0302@users.noreply.github.com>
Date: Thu, 15 Sep 2022 15:47:59 +0800
Subject: [PATCH 15/15] [tts] finetune add frozen (#2385)
* finetune add frozen
---
examples/other/tts_finetune/tts3/README.md | 9 +
examples/other/tts_finetune/tts3/finetune.py | 43 ++++-
.../other/tts_finetune/tts3/finetune.yaml | 12 ++
.../other/tts_finetune/tts3/local/extract.py | 7 +-
.../other/tts_finetune/tts3/local/train.py | 178 ++++++++++++++++++
examples/other/tts_finetune/tts3/run.sh | 12 +-
6 files changed, 242 insertions(+), 19 deletions(-)
create mode 100644 examples/other/tts_finetune/tts3/finetune.yaml
create mode 100644 examples/other/tts_finetune/tts3/local/train.py
diff --git a/examples/other/tts_finetune/tts3/README.md b/examples/other/tts_finetune/tts3/README.md
index 1ad30328b..192ee7ff4 100644
--- a/examples/other/tts_finetune/tts3/README.md
+++ b/examples/other/tts_finetune/tts3/README.md
@@ -75,6 +75,15 @@ When "Prepare" done. The structure of the current directory is listed below.
```
+### Set finetune.yaml
+`finetune.yaml` contains some configurations for fine-tuning. You can try various options to get a better result.
+Arguments:
+ - `batch_size`: fine-tuning batch size. Default: -1, which means 64, the same as the pretrained model.
+ - `learning_rate`: learning rate. Default: 0.0001
+ - `num_snapshots`: number of saved models. Default: -1, which means 5, the same as the pretrained model.
+ - `frozen_layers`: layers to be frozen; must be a list. If you don't want to freeze any layer, set [].
+
+
## Get Started
Run the command below to
diff --git a/examples/other/tts_finetune/tts3/finetune.py b/examples/other/tts_finetune/tts3/finetune.py
index 0f060b44d..207e2dbc5 100644
--- a/examples/other/tts_finetune/tts3/finetune.py
+++ b/examples/other/tts_finetune/tts3/finetune.py
@@ -14,6 +14,7 @@
import argparse
import os
from pathlib import Path
+from typing import List
from typing import Union
import yaml
@@ -21,10 +22,10 @@ from local.check_oov import get_check_result
from local.extract import extract_feature
from local.label_process import get_single_label
from local.prepare_env import generate_finetune_env
+from local.train import train_sp
from paddle import distributed as dist
from yacs.config import CfgNode
-from paddlespeech.t2s.exps.fastspeech2.train import train_sp
from utils.gen_duration_from_textgrid import gen_duration_from_textgrid
DICT_EN = 'tools/aligner/cmudict-0.7b'
@@ -38,15 +39,24 @@ os.environ['PATH'] = MFA_PATH + '/:' + os.environ['PATH']
class TrainArgs():
- def __init__(self, ngpu, config_file, dump_dir: Path, output_dir: Path):
+ def __init__(self,
+ ngpu,
+ config_file,
+ dump_dir: Path,
+ output_dir: Path,
+ frozen_layers: List[str]):
+ # config: fastspeech2 config file.
self.config = str(config_file)
self.train_metadata = str(dump_dir / "train/norm/metadata.jsonl")
self.dev_metadata = str(dump_dir / "dev/norm/metadata.jsonl")
+ # model output dir.
self.output_dir = str(output_dir)
self.ngpu = ngpu
self.phones_dict = str(dump_dir / "phone_id_map.txt")
self.speaker_dict = str(dump_dir / "speaker_id_map.txt")
self.voice_cloning = False
+ # frozen layers
+ self.frozen_layers = frozen_layers
def get_mfa_result(
@@ -122,12 +132,11 @@ if __name__ == '__main__':
"--ngpu", type=int, default=2, help="if ngpu=0, use cpu.")
parser.add_argument("--epoch", type=int, default=100, help="finetune epoch")
-
parser.add_argument(
- "--batch_size",
- type=int,
- default=-1,
- help="batch size, default -1 means same as pretrained model")
+ "--finetune_config",
+ type=str,
+ default="./finetune.yaml",
+ help="Path to finetune config file")
args = parser.parse_args()
@@ -147,8 +156,14 @@ if __name__ == '__main__':
with open(config_file) as f:
config = CfgNode(yaml.safe_load(f))
config.max_epoch = config.max_epoch + args.epoch
- if args.batch_size > 0:
- config.batch_size = args.batch_size
+
+ with open(args.finetune_config) as f2:
+ finetune_config = CfgNode(yaml.safe_load(f2))
+ config.batch_size = finetune_config.batch_size if finetune_config.batch_size > 0 else config.batch_size
+ config.optimizer.learning_rate = finetune_config.learning_rate if finetune_config.learning_rate > 0 else config.optimizer.learning_rate
+ config.num_snapshots = finetune_config.num_snapshots if finetune_config.num_snapshots > 0 else config.num_snapshots
+ frozen_layers = finetune_config.frozen_layers
+ assert type(frozen_layers) == list, "frozen_layers should be set a list."
if args.lang == 'en':
lexicon_file = DICT_EN
@@ -158,6 +173,13 @@ if __name__ == '__main__':
mfa_phone_file = MFA_PHONE_ZH
else:
print('please input right lang!!')
+
+ print(f"finetune max_epoch: {config.max_epoch}")
+ print(f"finetune batch_size: {config.batch_size}")
+ print(f"finetune learning_rate: {config.optimizer.learning_rate}")
+ print(f"finetune num_snapshots: {config.num_snapshots}")
+ print(f"finetune frozen_layers: {frozen_layers}")
+
am_phone_file = pretrained_model_dir / "phone_id_map.txt"
label_file = input_dir / "labels.txt"
@@ -181,7 +203,8 @@ if __name__ == '__main__':
generate_finetune_env(output_dir, pretrained_model_dir)
# create a new args for training
- train_args = TrainArgs(args.ngpu, config_file, dump_dir, output_dir)
+ train_args = TrainArgs(args.ngpu, config_file, dump_dir, output_dir,
+ frozen_layers)
# finetune models
# dispatch
diff --git a/examples/other/tts_finetune/tts3/finetune.yaml b/examples/other/tts_finetune/tts3/finetune.yaml
new file mode 100644
index 000000000..374a69f3d
--- /dev/null
+++ b/examples/other/tts_finetune/tts3/finetune.yaml
@@ -0,0 +1,12 @@
+###########################################################
+# PARAS SETTING #
+###########################################################
+# Set to -1 to indicate that the parameter is the same as the pretrained model configuration
+
+batch_size: -1
+learning_rate: 0.0001 # learning rate
+num_snapshots: -1
+
+# frozen_layers should be a list
+# if you don't need to freeze, set frozen_layers to []
+frozen_layers: ["encoder", "duration_predictor"]
diff --git a/examples/other/tts_finetune/tts3/local/extract.py b/examples/other/tts_finetune/tts3/local/extract.py
index edd92420b..630b58ce3 100644
--- a/examples/other/tts_finetune/tts3/local/extract.py
+++ b/examples/other/tts_finetune/tts3/local/extract.py
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
-import math
import os
from operator import itemgetter
from pathlib import Path
@@ -211,9 +210,9 @@ def extract_feature(duration_file: str,
mel_extractor, pitch_extractor, energy_extractor = get_extractor(config)
wav_files = sorted(list((input_dir).rglob("*.wav")))
- # split data into 3 sections, train: 80%, dev: 10%, test: 10%
- num_train = math.ceil(len(wav_files) * 0.8)
- num_dev = math.ceil(len(wav_files) * 0.1)
+ # split data into 3 sections, train: len(wav_files) - 2, dev: 1, test: 1
+ num_train = len(wav_files) - 2
+ num_dev = 1
print(num_train, num_dev)
train_wav_files = wav_files[:num_train]
diff --git a/examples/other/tts_finetune/tts3/local/train.py b/examples/other/tts_finetune/tts3/local/train.py
new file mode 100644
index 000000000..d065ae593
--- /dev/null
+++ b/examples/other/tts_finetune/tts3/local/train.py
@@ -0,0 +1,178 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import os
+import shutil
+from pathlib import Path
+from typing import List
+
+import jsonlines
+import numpy as np
+import paddle
+from paddle import DataParallel
+from paddle import distributed as dist
+from paddle.io import DataLoader
+from paddle.io import DistributedBatchSampler
+
+from paddlespeech.t2s.datasets.am_batch_fn import fastspeech2_multi_spk_batch_fn
+from paddlespeech.t2s.datasets.am_batch_fn import fastspeech2_single_spk_batch_fn
+from paddlespeech.t2s.datasets.data_table import DataTable
+from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
+from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Evaluator
+from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Updater
+from paddlespeech.t2s.training.extensions.snapshot import Snapshot
+from paddlespeech.t2s.training.extensions.visualizer import VisualDL
+from paddlespeech.t2s.training.optimizer import build_optimizers
+from paddlespeech.t2s.training.seeding import seed_everything
+from paddlespeech.t2s.training.trainer import Trainer
+
+
+def freeze_layer(model, layers: List[str]):
+ """freeze layers
+
+ Args:
+ layers (List[str]): frozen layers
+ """
+ for layer in layers:
+ for param in eval("model." + layer + ".parameters()"):
+ param.trainable = False
+
+
+def train_sp(args, config):
+ # decides device type and whether to run in parallel
+ # setup running environment correctly
+ if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0:
+ paddle.set_device("cpu")
+ else:
+ paddle.set_device("gpu")
+ world_size = paddle.distributed.get_world_size()
+ if world_size > 1:
+ paddle.distributed.init_parallel_env()
+
+ # set the random seed, it is a must for multiprocess training
+ seed_everything(config.seed)
+
+ print(
+ f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}",
+ )
+ fields = [
+ "text", "text_lengths", "speech", "speech_lengths", "durations",
+ "pitch", "energy"
+ ]
+ converters = {"speech": np.load, "pitch": np.load, "energy": np.load}
+ spk_num = None
+ if args.speaker_dict is not None:
+ print("multiple speaker fastspeech2!")
+ collate_fn = fastspeech2_multi_spk_batch_fn
+ with open(args.speaker_dict, 'rt') as f:
+ spk_id = [line.strip().split() for line in f.readlines()]
+ spk_num = len(spk_id)
+ fields += ["spk_id"]
+ elif args.voice_cloning:
+ print("Training voice cloning!")
+ collate_fn = fastspeech2_multi_spk_batch_fn
+ fields += ["spk_emb"]
+ converters["spk_emb"] = np.load
+ else:
+ print("single speaker fastspeech2!")
+ collate_fn = fastspeech2_single_spk_batch_fn
+ print("spk_num:", spk_num)
+
+ # dataloader has been too verbose
+ logging.getLogger("DataLoader").disabled = True
+
+ # construct dataset for training and validation
+ with jsonlines.open(args.train_metadata, 'r') as reader:
+ train_metadata = list(reader)
+ train_dataset = DataTable(
+ data=train_metadata,
+ fields=fields,
+ converters=converters, )
+ with jsonlines.open(args.dev_metadata, 'r') as reader:
+ dev_metadata = list(reader)
+ dev_dataset = DataTable(
+ data=dev_metadata,
+ fields=fields,
+ converters=converters, )
+
+ # collate function and dataloader
+
+ train_sampler = DistributedBatchSampler(
+ train_dataset,
+ batch_size=config.batch_size,
+ shuffle=True,
+ drop_last=True)
+
+ print("samplers done!")
+
+ train_dataloader = DataLoader(
+ train_dataset,
+ batch_sampler=train_sampler,
+ collate_fn=collate_fn,
+ num_workers=config.num_workers)
+
+ dev_dataloader = DataLoader(
+ dev_dataset,
+ shuffle=False,
+ drop_last=False,
+ batch_size=config.batch_size,
+ collate_fn=collate_fn,
+ num_workers=config.num_workers)
+ print("dataloaders done!")
+
+ with open(args.phones_dict, "r") as f:
+ phn_id = [line.strip().split() for line in f.readlines()]
+ vocab_size = len(phn_id)
+ print("vocab_size:", vocab_size)
+
+ odim = config.n_mels
+ model = FastSpeech2(
+ idim=vocab_size, odim=odim, spk_num=spk_num, **config["model"])
+
+ # freeze layer
+ if args.frozen_layers != []:
+ freeze_layer(model, args.frozen_layers)
+
+ if world_size > 1:
+ model = DataParallel(model)
+ print("model done!")
+
+ optimizer = build_optimizers(model, **config["optimizer"])
+ print("optimizer done!")
+
+ output_dir = Path(args.output_dir)
+ output_dir.mkdir(parents=True, exist_ok=True)
+ if dist.get_rank() == 0:
+ config_name = args.config.split("/")[-1]
+ # copy conf to output_dir
+ shutil.copyfile(args.config, output_dir / config_name)
+
+ updater = FastSpeech2Updater(
+ model=model,
+ optimizer=optimizer,
+ dataloader=train_dataloader,
+ output_dir=output_dir,
+ **config["updater"])
+
+ trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir)
+
+ evaluator = FastSpeech2Evaluator(
+ model, dev_dataloader, output_dir=output_dir, **config["updater"])
+
+ if dist.get_rank() == 0:
+ trainer.extend(evaluator, trigger=(1, "epoch"))
+ trainer.extend(VisualDL(output_dir), trigger=(1, "iteration"))
+ trainer.extend(
+ Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch'))
+ trainer.run()
diff --git a/examples/other/tts_finetune/tts3/run.sh b/examples/other/tts_finetune/tts3/run.sh
index 9bb7ec6f0..9c877e642 100755
--- a/examples/other/tts_finetune/tts3/run.sh
+++ b/examples/other/tts_finetune/tts3/run.sh
@@ -10,11 +10,12 @@ mfa_dir=./mfa_result
dump_dir=./dump
output_dir=./exp/default
lang=zh
-ngpu=2
+ngpu=1
+finetune_config=./finetune.yaml
-ckpt=snapshot_iter_96600
+ckpt=snapshot_iter_96699
-gpus=0,1
+gpus=1
CUDA_VISIBLE_DEVICES=${gpus}
stage=0
stop_stage=100
@@ -35,7 +36,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--output_dir=${output_dir} \
--lang=${lang} \
--ngpu=${ngpu} \
- --epoch=100
+ --epoch=100 \
+ --finetune_config=${finetune_config}
fi
@@ -54,7 +56,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_stat=pretrained_models/hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
- --output_dir=./test_e2e \
+ --output_dir=./test_e2e/ \
--phones_dict=${dump_dir}/phone_id_map.txt \
--speaker_dict=${dump_dir}/speaker_id_map.txt \
--spk_id=0
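Note: `freeze_layer()` in the new `local/train.py` resolves layer names with `eval("model." + layer + ".parameters()")`. Below is an eval-free sketch of the same idea, using `operator.attrgetter` so dotted names such as `"encoder.embed"` also resolve; this is an illustrative alternative, not the code shipped in the patch:

```python
from operator import attrgetter
from typing import List

def freeze_layers(model, layers: List[str]) -> None:
    """Mark the parameters of the named sub-layers as non-trainable."""
    for name in layers:
        sublayer = attrgetter(name)(model)  # e.g. model.encoder
        for param in sublayer.parameters():
            param.trainable = False
```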