|
|
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
|
|
|
#
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
import argparse
|
|
|
|
import os
|
|
|
|
|
|
|
|
import numpy as np
|
|
|
|
import paddle
|
|
|
|
from paddle.io import BatchSampler
|
|
|
|
from paddle.io import DataLoader
|
|
|
|
from paddle.io import DistributedBatchSampler
|
|
|
|
from yacs.config import CfgNode
|
|
|
|
|
|
|
|
from paddleaudio.paddleaudio.compliance.librosa import melspectrogram
|
|
|
|
from paddleaudio.paddleaudio.datasets.voxceleb import VoxCeleb
|
|
|
|
from paddlespeech.s2t.utils.log import Log
|
|
|
|
from paddlespeech.vector.io.augment import build_augment_pipeline
|
|
|
|
from paddlespeech.vector.io.augment import waveform_augment
|
|
|
|
from paddlespeech.vector.io.batch import feature_normalize
|
|
|
|
from paddlespeech.vector.io.batch import waveform_collate_fn
|
|
|
|
from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn
|
|
|
|
from paddlespeech.vector.modules.loss import AdditiveAngularMargin
|
|
|
|
from paddlespeech.vector.modules.loss import LogSoftmaxWrapper
|
|
|
|
from paddlespeech.vector.modules.sid_model import SpeakerIdetification
|
|
|
|
from paddlespeech.vector.training.scheduler import CyclicLRScheduler
|
|
|
|
from paddlespeech.vector.training.seeding import seed_everything
|
|
|
|
from paddlespeech.vector.utils.time import Timer
|
|
|
|
|
|
|
|
logger = Log(__name__).getlog()
|
|
|
|
|
|
|
|
|
|
|
|
def main(args, config):
|
|
|
|
# stage0: set the training device, cpu or gpu
|
|
|
|
paddle.set_device(args.device)
|
|
|
|
|
|
|
|
# stage1: we must call the paddle.distributed.init_parallel_env() api at the begining
|
|
|
|
paddle.distributed.init_parallel_env()
|
|
|
|
nranks = paddle.distributed.get_world_size()
|
|
|
|
local_rank = paddle.distributed.get_rank()
|
|
|
|
# set the random seed, it is a must for multiprocess training
|
|
|
|
seed_everything(config.seed)
|
|
|
|
|
|
|
|
# stage2: data prepare, such vox1 and vox2 data, and augment noise data and pipline
|
|
|
|
# note: some cmd must do in rank==0, so wo will refactor the data prepare code
|
|
|
|
train_dataset = VoxCeleb('train', target_dir=args.data_dir)
|
|
|
|
dev_dataset = VoxCeleb('dev', target_dir=args.data_dir)
|
|
|
|
|
|
|
|
if args.augment:
|
|
|
|
augment_pipeline = build_augment_pipeline(target_dir=args.data_dir)
|
|
|
|
else:
|
|
|
|
augment_pipeline = []
|
|
|
|
|
|
|
|
# stage3: build the dnn backbone model network
|
|
|
|
ecapa_tdnn = EcapaTdnn(**config.model)
|
|
|
|
|
|
|
|
# stage4: build the speaker verification train instance with backbone model
|
|
|
|
model = SpeakerIdetification(
|
|
|
|
backbone=ecapa_tdnn, num_class=VoxCeleb.num_speakers)
|
|
|
|
|
|
|
|
# stage5: build the optimizer, we now only construct the AdamW optimizer
|
|
|
|
lr_schedule = CyclicLRScheduler(
|
|
|
|
base_lr=config.learning_rate, max_lr=1e-3, step_size=140000 // nranks)
|
|
|
|
optimizer = paddle.optimizer.AdamW(
|
|
|
|
learning_rate=lr_schedule, parameters=model.parameters())
|
|
|
|
|
|
|
|
# stage6: build the loss function, we now only support LogSoftmaxWrapper
|
|
|
|
criterion = LogSoftmaxWrapper(
|
|
|
|
loss_fn=AdditiveAngularMargin(margin=0.2, scale=30))
|
|
|
|
|
|
|
|
# stage7: confirm training start epoch
|
|
|
|
# if pre-trained model exists, start epoch confirmed by the pre-trained model
|
|
|
|
start_epoch = 0
|
|
|
|
if args.load_checkpoint:
|
|
|
|
logger.info("load the check point")
|
|
|
|
args.load_checkpoint = os.path.abspath(
|
|
|
|
os.path.expanduser(args.load_checkpoint))
|
|
|
|
try:
|
|
|
|
# load model checkpoint
|
|
|
|
state_dict = paddle.load(
|
|
|
|
os.path.join(args.load_checkpoint, 'model.pdparams'))
|
|
|
|
model.set_state_dict(state_dict)
|
|
|
|
|
|
|
|
# load optimizer checkpoint
|
|
|
|
state_dict = paddle.load(
|
|
|
|
os.path.join(args.load_checkpoint, 'model.pdopt'))
|
|
|
|
optimizer.set_state_dict(state_dict)
|
|
|
|
if local_rank == 0:
|
|
|
|
logger.info(f'Checkpoint loaded from {args.load_checkpoint}')
|
|
|
|
except FileExistsError:
|
|
|
|
if local_rank == 0:
|
|
|
|
logger.info('Train from scratch.')
|
|
|
|
|
|
|
|
try:
|
|
|
|
start_epoch = int(args.load_checkpoint[-1])
|
|
|
|
logger.info(f'Restore training from epoch {start_epoch}.')
|
|
|
|
except ValueError:
|
|
|
|
pass
|
|
|
|
|
|
|
|
# stage8: we build the batch sampler for paddle.DataLoader
|
|
|
|
train_sampler = DistributedBatchSampler(
|
|
|
|
train_dataset,
|
|
|
|
batch_size=config.batch_size,
|
|
|
|
shuffle=True,
|
|
|
|
drop_last=False)
|
|
|
|
train_loader = DataLoader(
|
|
|
|
train_dataset,
|
|
|
|
batch_sampler=train_sampler,
|
|
|
|
num_workers=config.num_workers,
|
|
|
|
collate_fn=waveform_collate_fn,
|
|
|
|
return_list=True,
|
|
|
|
use_buffer_reader=True, )
|
|
|
|
|
|
|
|
# stage9: start to train
|
|
|
|
# we will comment the training process
|
|
|
|
steps_per_epoch = len(train_sampler)
|
|
|
|
timer = Timer(steps_per_epoch * config.epochs)
|
|
|
|
timer.start()
|
|
|
|
|
|
|
|
for epoch in range(start_epoch + 1, config.epochs + 1):
|
|
|
|
# at the begining, model must set to train mode
|
|
|
|
model.train()
|
|
|
|
|
|
|
|
avg_loss = 0
|
|
|
|
num_corrects = 0
|
|
|
|
num_samples = 0
|
|
|
|
for batch_idx, batch in enumerate(train_loader):
|
|
|
|
# stage 9-1: batch data is audio sample points and speaker id label
|
|
|
|
waveforms, labels = batch['waveforms'], batch['labels']
|
|
|
|
|
|
|
|
# stage 9-2: audio sample augment method, which is done on the audio sample point
|
|
|
|
if len(augment_pipeline) != 0:
|
|
|
|
waveforms = waveform_augment(waveforms, augment_pipeline)
|
|
|
|
labels = paddle.concat(
|
|
|
|
[labels for i in range(len(augment_pipeline) + 1)])
|
|
|
|
|
|
|
|
# stage 9-3: extract the audio feats,such fbank, mfcc, spectrogram
|
|
|
|
feats = []
|
|
|
|
for waveform in waveforms.numpy():
|
|
|
|
feat = melspectrogram(x=waveform, **config.feature)
|
|
|
|
feats.append(feat)
|
|
|
|
feats = paddle.to_tensor(np.asarray(feats))
|
|
|
|
|
|
|
|
# stage 9-4: feature normalize, which help converge and imporve the performance
|
|
|
|
feats = feature_normalize(
|
|
|
|
feats, mean_norm=True, std_norm=False) # Features normalization
|
|
|
|
|
|
|
|
# stage 9-5: model forward, such ecapa-tdnn, x-vector
|
|
|
|
logits = model(feats)
|
|
|
|
|
|
|
|
# stage 9-6: loss function criterion, such AngularMargin, AdditiveAngularMargin
|
|
|
|
loss = criterion(logits, labels)
|
|
|
|
|
|
|
|
# stage 9-7: update the gradient and clear the gradient cache
|
|
|
|
loss.backward()
|
|
|
|
optimizer.step()
|
|
|
|
if isinstance(optimizer._learning_rate,
|
|
|
|
paddle.optimizer.lr.LRScheduler):
|
|
|
|
optimizer._learning_rate.step()
|
|
|
|
optimizer.clear_grad()
|
|
|
|
|
|
|
|
# stage 9-8: Calculate average loss per batch
|
|
|
|
avg_loss += loss.numpy()[0]
|
|
|
|
|
|
|
|
# stage 9-9: Calculate metrics, which is one-best accuracy
|
|
|
|
preds = paddle.argmax(logits, axis=1)
|
|
|
|
num_corrects += (preds == labels).numpy().sum()
|
|
|
|
num_samples += feats.shape[0]
|
|
|
|
timer.count() # step plus one in timer
|
|
|
|
|
|
|
|
# stage 9-10: print the log information only on 0-rank per log-freq batchs
|
|
|
|
if (batch_idx + 1) % config.log_interval == 0 and local_rank == 0:
|
|
|
|
lr = optimizer.get_lr()
|
|
|
|
avg_loss /= config.log_interval
|
|
|
|
avg_acc = num_corrects / num_samples
|
|
|
|
|
|
|
|
print_msg = 'Train Epoch={}/{}, Step={}/{}'.format(
|
|
|
|
epoch, config.epochs, batch_idx + 1, steps_per_epoch)
|
|
|
|
print_msg += ' loss={:.4f}'.format(avg_loss)
|
|
|
|
print_msg += ' acc={:.4f}'.format(avg_acc)
|
|
|
|
print_msg += ' lr={:.4E} step/sec={:.2f} | ETA {}'.format(
|
|
|
|
lr, timer.timing, timer.eta)
|
|
|
|
logger.info(print_msg)
|
|
|
|
|
|
|
|
avg_loss = 0
|
|
|
|
num_corrects = 0
|
|
|
|
num_samples = 0
|
|
|
|
|
|
|
|
# stage 9-11: save the model parameters only on 0-rank per save-freq batchs
|
|
|
|
if epoch % config.save_interval == 0 and batch_idx + 1 == steps_per_epoch:
|
|
|
|
if local_rank != 0:
|
|
|
|
paddle.distributed.barrier(
|
|
|
|
) # Wait for valid step in main process
|
|
|
|
continue # Resume trainning on other process
|
|
|
|
|
|
|
|
# stage 9-12: construct the valid dataset dataloader
|
|
|
|
dev_sampler = BatchSampler(
|
|
|
|
dev_dataset,
|
|
|
|
batch_size=config.batch_size // 4,
|
|
|
|
shuffle=False,
|
|
|
|
drop_last=False)
|
|
|
|
dev_loader = DataLoader(
|
|
|
|
dev_dataset,
|
|
|
|
batch_sampler=dev_sampler,
|
|
|
|
collate_fn=waveform_collate_fn,
|
|
|
|
num_workers=config.num_workers,
|
|
|
|
return_list=True, )
|
|
|
|
|
|
|
|
# set the model to eval mode
|
|
|
|
model.eval()
|
|
|
|
num_corrects = 0
|
|
|
|
num_samples = 0
|
|
|
|
|
|
|
|
# stage 9-13: evaluation the valid dataset batch data
|
|
|
|
logger.info('Evaluate on validation dataset')
|
|
|
|
with paddle.no_grad():
|
|
|
|
for batch_idx, batch in enumerate(dev_loader):
|
|
|
|
waveforms, labels = batch['waveforms'], batch['labels']
|
|
|
|
|
|
|
|
feats = []
|
|
|
|
for waveform in waveforms.numpy():
|
|
|
|
# feat = melspectrogram(x=waveform, **cpu_feat_conf)
|
|
|
|
feat = melspectrogram(x=waveform, **config.feature)
|
|
|
|
feats.append(feat)
|
|
|
|
|
|
|
|
feats = paddle.to_tensor(np.asarray(feats))
|
|
|
|
feats = feature_normalize(
|
|
|
|
feats, mean_norm=True, std_norm=False)
|
|
|
|
logits = model(feats)
|
|
|
|
|
|
|
|
preds = paddle.argmax(logits, axis=1)
|
|
|
|
num_corrects += (preds == labels).numpy().sum()
|
|
|
|
num_samples += feats.shape[0]
|
|
|
|
|
|
|
|
print_msg = '[Evaluation result]'
|
|
|
|
print_msg += ' dev_acc={:.4f}'.format(num_corrects / num_samples)
|
|
|
|
logger.info(print_msg)
|
|
|
|
|
|
|
|
# stage 9-14: Save model parameters
|
|
|
|
save_dir = os.path.join(args.checkpoint_dir,
|
|
|
|
'epoch_{}'.format(epoch))
|
|
|
|
logger.info('Saving model checkpoint to {}'.format(save_dir))
|
|
|
|
paddle.save(model.state_dict(),
|
|
|
|
os.path.join(save_dir, 'model.pdparams'))
|
|
|
|
paddle.save(optimizer.state_dict(),
|
|
|
|
os.path.join(save_dir, 'model.pdopt'))
|
|
|
|
|
|
|
|
if nranks > 1:
|
|
|
|
paddle.distributed.barrier() # Main process
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
# yapf: disable
|
|
|
|
parser = argparse.ArgumentParser(__doc__)
|
|
|
|
parser.add_argument('--device',
|
|
|
|
choices=['cpu', 'gpu'],
|
|
|
|
default="cpu",
|
|
|
|
help="Select which device to train model, defaults to gpu.")
|
|
|
|
parser.add_argument("--config",
|
|
|
|
default=None,
|
|
|
|
type=str,
|
|
|
|
help="configuration file")
|
|
|
|
parser.add_argument("--data-dir",
|
|
|
|
default="./data/",
|
|
|
|
type=str,
|
|
|
|
help="data directory")
|
|
|
|
parser.add_argument("--load-checkpoint",
|
|
|
|
type=str,
|
|
|
|
default=None,
|
|
|
|
help="Directory to load model checkpoint to contiune trainning.")
|
|
|
|
parser.add_argument("--checkpoint-dir",
|
|
|
|
type=str,
|
|
|
|
default='./checkpoint',
|
|
|
|
help="Directory to save model checkpoints.")
|
|
|
|
parser.add_argument("--augment",
|
|
|
|
action="store_true",
|
|
|
|
default=False,
|
|
|
|
help="Apply audio augments.")
|
|
|
|
|
|
|
|
args = parser.parse_args()
|
|
|
|
# yapf: enable
|
|
|
|
|
|
|
|
# https://yaml.org/type/float.html
|
|
|
|
config = CfgNode(new_allowed=True)
|
|
|
|
if args.config:
|
|
|
|
config.merge_from_file(args.config)
|
|
|
|
|
|
|
|
config.freeze()
|
|
|
|
print(config)
|
|
|
|
|
|
|
|
main(args, config)
|