add sid learning rate and training model

pull/1523/head
xiongxinlei 2 years ago
parent dc28ebe4ee
commit 57c4f4a68c

@@ -15,10 +15,14 @@ import argparse

import paddle
from dataset.voxceleb.voxceleb1 import VoxCeleb1
from paddleaudio.datasets.voxceleb import VoxCeleb1
from paddlespeech.vector.layers.lr import CyclicLRScheduler
from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn
from paddlespeech.vector.training.sid_model import SpeakerIdetification


def main(args):
    # stage0: set the training device, cpu or gpu
    paddle.set_device(args.device)

    # stage1: we must call the paddle.distributed.init_parallel_env() api at the beginning
@@ -27,8 +31,32 @@ def main(args):
    local_rank = paddle.distributed.get_rank()

    # stage2: data prepare
    # note: some commands must run on rank==0 only
    train_ds = VoxCeleb1('train', target_dir=args.data_dir)

    # stage3: build the dnn backbone model network
    model_conf = {
        "input_size": 80,
        "channels": [1024, 1024, 1024, 1024, 3072],
        "kernel_sizes": [5, 3, 3, 3, 1],
        "dilations": [1, 2, 3, 4, 1],
        "attention_channels": 128,
        "lin_neurons": 192,
    }
    ecapa_tdnn = EcapaTdnn(**model_conf)

    # stage4: build the speaker verification training instance with the backbone model
    model = SpeakerIdetification(
        backbone=ecapa_tdnn, num_class=VoxCeleb1.num_speakers)

    # stage5: build the optimizer, we now only construct the AdamW optimizer
    lr_schedule = CyclicLRScheduler(
        base_lr=args.learning_rate, max_lr=1e-3, step_size=140000 // nranks)
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_schedule, parameters=model.parameters())

    # stage6: build the loss function, we now only support LogSoftmaxWrapper


if __name__ == "__main__":
    # yapf: disable
@@ -41,6 +69,10 @@ if __name__ == "__main__":
                        default="./data/",
                        type=str,
                        help="data directory")
    parser.add_argument("--learning_rate",
                        type=float,
                        default=1e-8,
                        help="Learning rate used to train with warmup.")
    args = parser.parse_args()
    # yapf: enable
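
The train.py diff ends right at the stage6 comment, so the loss construction itself is not part of this commit. For orientation, here is a minimal, hedged sketch of what a LogSoftmaxWrapper-based criterion typically looks like in this SpeechBrain-style setup; the import path, margin, and scale below are assumptions, not code from this commit:

# assumed import path; the loss module is not added by this commit
from paddlespeech.vector.modules.loss import AdditiveAngularMargin, LogSoftmaxWrapper

# AdditiveAngularMargin adds an angular margin to the target-class cosine
# logit and scales all logits; LogSoftmaxWrapper turns the result into a
# log-softmax classification loss.
criterion = LogSoftmaxWrapper(
    loss_fn=AdditiveAngularMargin(margin=0.2, scale=30.0))

# inside the training loop, the cosine logits produced by
# SpeakerIdetification.forward would feed the criterion:
# loss = criterion(logits, labels)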

paddlespeech/vector/layers/lr.py
@@ -0,0 +1,45 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.optimizer.lr import LRScheduler


class CyclicLRScheduler(LRScheduler):
    def __init__(self,
                 base_lr: float=1e-8,
                 max_lr: float=1e-3,
                 step_size: int=10000):
        super(CyclicLRScheduler, self).__init__()

        self.current_step = -1
        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size

    def step(self):
        # The base LRScheduler calls step() inside its own __init__, before
        # current_step has been set; skip that first spurious call.
        if not hasattr(self, 'current_step'):
            return

        self.current_step += 1
        if self.current_step >= 2 * self.step_size:
            self.current_step %= 2 * self.step_size

        self.last_lr = self.get_lr()

    def get_lr(self):
        p = self.current_step / (2 * self.step_size)  # Proportion in one cycle.
        if p < 0.5:  # Increase
            return self.base_lr + p / 0.5 * (self.max_lr - self.base_lr)
        else:  # Decrease
            return self.max_lr - (p / 0.5 - 1) * (self.max_lr - self.base_lr)

paddlespeech/vector/training/sid_model.py
@@ -0,0 +1,60 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F


class SpeakerIdetification(nn.Layer):
    def __init__(
            self,
            backbone,
            num_class,
            lin_blocks=0,
            lin_neurons=192,
            dropout=0.1, ):
        super(SpeakerIdetification, self).__init__()

        self.backbone = backbone
        if dropout > 0:
            self.dropout = nn.Dropout(dropout)
        else:
            self.dropout = None

        input_size = self.backbone.emb_size
        self.blocks = nn.LayerList()
        for i in range(lin_blocks):
            self.blocks.extend([
                nn.BatchNorm1D(input_size),
                nn.Linear(in_features=input_size, out_features=lin_neurons),
            ])
            input_size = lin_neurons

        self.weight = paddle.create_parameter(
            shape=(input_size, num_class),
            dtype='float32',
            attr=paddle.ParamAttr(initializer=nn.initializer.XavierUniform()), )

    def forward(self, x, lengths=None):
        # x.shape: (N, C, L)
        x = self.backbone(x, lengths).squeeze(
            -1)  # (N, emb_size, 1) -> (N, emb_size)
        if self.dropout is not None:
            x = self.dropout(x)

        for fc in self.blocks:
            x = fc(x)

        logits = F.linear(F.normalize(x), F.normalize(self.weight, axis=0))

        return logits
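
As a quick smoke test of this head, a hedged sketch with a stand-in backbone. ToyBackbone is hypothetical and only mimics the interface the head relies on (an emb_size attribute and a forward returning (N, emb_size, 1)); 1211 is the VoxCeleb1 dev-set speaker count:

import paddle
import paddle.nn as nn
from paddlespeech.vector.training.sid_model import SpeakerIdetification


class ToyBackbone(nn.Layer):
    """Hypothetical stand-in for EcapaTdnn, used only for shape checking."""

    def __init__(self, emb_size=192):
        super().__init__()
        self.emb_size = emb_size  # the head reads backbone.emb_size
        self.proj = nn.Linear(80, emb_size)

    def forward(self, x, lengths=None):
        # x: (N, 80, L) -> average over time -> (N, emb_size, 1)
        return self.proj(x.mean(axis=-1)).unsqueeze(-1)


model = SpeakerIdetification(backbone=ToyBackbone(), num_class=1211)
feats = paddle.randn([4, 80, 100])  # (batch, mel bins, frames)
logits = model(feats)  # shape (4, 1211)

Because both the embeddings and the weight columns are L2-normalized, each logit is a cosine similarity in [-1, 1], which is exactly what a margin-based loss such as AdditiveAngularMargin expects.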