You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
PaddleSpeech/paddlespeech/vector/exps/ecapa_tdnn/test.py

303 lines
12 KiB

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import numpy as np
import paddle
from paddle.io import BatchSampler
from paddle.io import DataLoader
3 years ago
from paddleaudio.metric import compute_eer
from tqdm import tqdm
from yacs.config import CfgNode
from paddlespeech.s2t.utils.log import Log
from paddlespeech.vector.io.batch import batch_feature_normalize
from paddlespeech.vector.io.dataset import CSVDataset
from paddlespeech.vector.io.embedding_norm import InputNormalization
from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn
from paddlespeech.vector.modules.sid_model import SpeakerIdetification
from paddlespeech.vector.training.seeding import seed_everything
logger = Log(__name__).getlog()
def compute_dataset_embedding(data_loader, model, mean_var_norm_emb, config,
id2embedding):
"""compute the dataset embeddings
Args:
data_loader (paddle.io.Dataloader): the dataset loader to be compute the embedding
model (paddle.nn.Layer): the speaker verification model
mean_var_norm_emb : compute the embedding mean and std norm
config (yacs.config.CfgNode): the yaml config
"""
logger.info(
f'Computing embeddings on {data_loader.dataset.csv_path} dataset')
with paddle.no_grad():
for batch_idx, batch in enumerate(tqdm(data_loader)):
# stage 8-1: extrac the audio embedding
ids, feats, lengths = batch['ids'], batch['feats'], batch['lengths']
embeddings = model.backbone(feats, lengths).squeeze(
-1) # (N, emb_size, 1) -> (N, emb_size)
# Global embedding normalization.
# if we use the global embedding norm
# eer can reduece about relative 10%
if config.global_embedding_norm and mean_var_norm_emb:
lengths = paddle.ones([embeddings.shape[0]])
embeddings = mean_var_norm_emb(embeddings, lengths)
# Update embedding dict.
id2embedding.update(dict(zip(ids, embeddings)))
def compute_verification_scores(id2embedding, train_cohort, config):
"""Compute the verification trial scores
Args:
id2embedding (dict): the utterance embedding
train_cohort (paddle.tensor): the cohort dataset embedding
config (yacs.config.CfgNode): the yaml config
Returns:
the scores and the trial labels,
1 refers the target and 0 refers the nontarget in labels
"""
labels = []
enroll_ids = []
test_ids = []
logger.info(f"read the trial from {config.verification_file}")
cos_sim_func = paddle.nn.CosineSimilarity(axis=-1)
scores = []
with open(config.verification_file, 'r') as f:
for line in f.readlines():
label, enroll_id, test_id = line.strip().split(' ')
enroll_id = enroll_id.split('.')[0].replace('/', '-')
test_id = test_id.split('.')[0].replace('/', '-')
labels.append(int(label))
enroll_emb = id2embedding[enroll_id]
test_emb = id2embedding[test_id]
score = cos_sim_func(enroll_emb, test_emb).item()
if "score_norm" in config:
# Getting norm stats for enroll impostors
enroll_rep = paddle.tile(
enroll_emb, repeat_times=[train_cohort.shape[0], 1])
score_e_c = cos_sim_func(enroll_rep, train_cohort)
if "cohort_size" in config:
score_e_c, _ = paddle.topk(
score_e_c, k=config.cohort_size, axis=0)
mean_e_c = paddle.mean(score_e_c, axis=0)
std_e_c = paddle.std(score_e_c, axis=0)
# Getting norm stats for test impostors
test_rep = paddle.tile(
test_emb, repeat_times=[train_cohort.shape[0], 1])
score_t_c = cos_sim_func(test_rep, train_cohort)
if "cohort_size" in config:
score_t_c, _ = paddle.topk(
score_t_c, k=config.cohort_size, axis=0)
mean_t_c = paddle.mean(score_t_c, axis=0)
std_t_c = paddle.std(score_t_c, axis=0)
if config.score_norm == "s-norm":
score_e = (score - mean_e_c) / std_e_c
score_t = (score - mean_t_c) / std_t_c
score = 0.5 * (score_e + score_t)
elif config.score_norm == "z-norm":
score = (score - mean_e_c) / std_e_c
elif config.score_norm == "t-norm":
score = (score - mean_t_c) / std_t_c
scores.append(score)
return scores, labels
def main(args, config):
"""The main process for test the speaker verification model
Args:
args (argparse.Namespace): the command line args namespace
config (yacs.config.CfgNode): the yaml config
"""
# stage0: set the training device, cpu or gpu
# if set the gpu, paddlespeech will select a gpu according the env CUDA_VISIBLE_DEVICES
paddle.set_device(args.device)
# set the random seed, it is the necessary measures for multiprocess training
seed_everything(config.seed)
# stage1: build the dnn backbone model network
# we will extract the audio embedding from the backbone model
ecapa_tdnn = EcapaTdnn(**config.model)
# stage2: build the speaker verification eval instance with backbone model
# because the checkpoint dict name has the SpeakerIdetification prefix
# so we need to create the SpeakerIdetification instance
# but we acutally use the backbone model to extact the audio embedding
model = SpeakerIdetification(
backbone=ecapa_tdnn, num_class=config.num_speakers)
# stage3: load the pre-trained model
# generally, we get the last model from the epoch
args.load_checkpoint = os.path.abspath(
os.path.expanduser(args.load_checkpoint))
# load model checkpoint to sid model
state_dict = paddle.load(
os.path.join(args.load_checkpoint, 'model.pdparams'))
model.set_state_dict(state_dict)
logger.info(f'Checkpoint loaded from {args.load_checkpoint}')
# stage4: construct the enroll and test dataloader
# Now, wo think the enroll dataset is in the {args.data_dir}/vox/csv/enroll.csv,
# and the test dataset is in the {args.data_dir}/vox/csv/test.csv
enroll_dataset = CSVDataset(
os.path.join(args.data_dir, "vox/csv/enroll.csv"),
feat_type='melspectrogram',
random_chunk=False,
n_mels=config.n_mels,
window_size=config.window_size,
hop_length=config.hop_size)
enroll_sampler = BatchSampler(
enroll_dataset, batch_size=config.batch_size, shuffle=False)
enroll_loader = DataLoader(enroll_dataset,
batch_sampler=enroll_sampler,
collate_fn=lambda x: batch_feature_normalize(
x, mean_norm=True, std_norm=False),
num_workers=config.num_workers,
return_list=True,)
test_dataset = CSVDataset(
os.path.join(args.data_dir, "vox/csv/test.csv"),
feat_type='melspectrogram',
random_chunk=False,
n_mels=config.n_mels,
window_size=config.window_size,
hop_length=config.hop_size)
test_sampler = BatchSampler(
test_dataset, batch_size=config.batch_size, shuffle=False)
test_loader = DataLoader(test_dataset,
batch_sampler=test_sampler,
collate_fn=lambda x: batch_feature_normalize(
x, mean_norm=True, std_norm=False),
num_workers=config.num_workers,
return_list=True,)
# stage5: we must set the model to eval mode
model.eval()
# stage6: global embedding norm to imporve the performance
# and we create the InputNormalization instance to process the embedding mean and std norm
logger.info(f"global embedding norm: {config.global_embedding_norm}")
if config.global_embedding_norm:
mean_var_norm_emb = InputNormalization(
norm_type="global",
mean_norm=config.embedding_mean_norm,
std_norm=config.embedding_std_norm)
# stage 7: score norm need the imposters dataset
# we select the train dataset as the idea imposters dataset
# and we select the config.n_train_snts utterance to as the final imposters dataset
if "score_norm" in config:
logger.info(f"we will do score norm: {config.score_norm}")
train_dataset = CSVDataset(
os.path.join(args.data_dir, "vox/csv/train.csv"),
feat_type='melspectrogram',
n_train_snts=config.n_train_snts,
random_chunk=False,
n_mels=config.n_mels,
window_size=config.window_size,
hop_length=config.hop_size)
train_sampler = BatchSampler(
train_dataset, batch_size=config.batch_size, shuffle=False)
train_loader = DataLoader(train_dataset,
batch_sampler=train_sampler,
collate_fn=lambda x: batch_feature_normalize(
x, mean_norm=True, std_norm=False),
num_workers=config.num_workers,
return_list=True,)
# stage 8: Compute embeddings of audios in enrol and test dataset from model.
id2embedding = {}
# Run multi times to make embedding normalization more stable.
logger.info("First loop for enroll and test dataset")
compute_dataset_embedding(enroll_loader, model, mean_var_norm_emb, config,
id2embedding)
compute_dataset_embedding(test_loader, model, mean_var_norm_emb, config,
id2embedding)
logger.info("Second loop for enroll and test dataset")
compute_dataset_embedding(enroll_loader, model, mean_var_norm_emb, config,
id2embedding)
compute_dataset_embedding(test_loader, model, mean_var_norm_emb, config,
id2embedding)
mean_var_norm_emb.save(
os.path.join(args.load_checkpoint, "mean_var_norm_emb"))
# stage 9: Compute cosine scores.
train_cohort = None
if "score_norm" in config:
train_embeddings = {}
# cohort embedding not do mean and std norm
compute_dataset_embedding(train_loader, model, None, config,
train_embeddings)
train_cohort = paddle.stack(list(train_embeddings.values()))
# stage 10: compute the scores
scores, labels = compute_verification_scores(id2embedding, train_cohort,
config)
# stage 11: compute the EER and threshold
scores = paddle.to_tensor(scores)
EER, threshold = compute_eer(np.asarray(labels), scores.numpy())
logger.info(
f'EER of verification test: {EER*100:.4f}%, score threshold: {threshold:.5f}'
)
if __name__ == "__main__":
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument('--device',
choices=['cpu', 'gpu'],
default="gpu",
help="Select which device to train model, defaults to gpu.")
parser.add_argument("--config",
default=None,
type=str,
help="configuration file")
parser.add_argument("--data-dir",
default="./data/",
type=str,
help="data directory")
parser.add_argument("--load-checkpoint",
type=str,
default='',
help="Directory to load model checkpoint to contiune trainning.")
args = parser.parse_args()
# yapf: enable
# https://yaml.org/type/float.html
config = CfgNode(new_allowed=True)
if args.config:
config.merge_from_file(args.config)
config.freeze()
print(config)
main(args, config)