add speaker verification using cosine score, test=doc

4 years ago · 97ec01260b
parent 1f74af110b
commit 97ec01260b
2 changed files with 267 additions and 5 deletions
--- a/examples/voxceleb/sv0/local/speaker_verification_cosine.py
+++ b/examples/voxceleb/sv0/local/speaker_verification_cosine.py
@ -0,0 +1,238 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import ast
+import os
+
+import numpy as np
+import paddle
+from paddle.io import BatchSampler
+from paddle.io import DataLoader
+import paddle.nn.functional as F
+from paddlespeech.vector.training.metrics import compute_eer
+from paddleaudio.datasets.voxceleb import VoxCeleb1
+from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn
+from paddlespeech.vector.training.sid_model import SpeakerIdetification
+from tqdm import tqdm
+
+
+def pad_right_2d(x, target_length, axis=-1, mode='constant', **kwargs):
+    """Pad a 2D array at the end of one axis up to ``target_length``.
+
+    Args:
+        x: Array-like that converts to a 2D ``numpy.ndarray``.
+        target_length: Desired size of ``axis`` after padding.
+        axis: Axis to pad; ``0``/``-2`` pads rows, ``1``/``-1`` pads columns.
+        mode: Padding mode forwarded to ``numpy.pad``.
+        **kwargs: Extra keyword arguments forwarded to ``numpy.pad``.
+
+    Returns:
+        A new 2D array whose size along ``axis`` equals ``target_length``.
+
+    Raises:
+        AssertionError: If ``x`` is not 2D, or is already longer than
+            ``target_length`` along ``axis``.
+        IndexError: If ``axis`` is out of range for a 2D array.
+    """
+    x = np.asarray(x)
+    assert x.ndim == 2, f'Only 2D arrays supported, but got shape: {x.shape}'
+
+    # Indexing ``shape`` also rejects an out-of-range axis with IndexError.
+    pad_size = target_length - x.shape[axis]
+    assert pad_size >= 0, f'Target length {target_length} is less than origin length {x.shape[axis]}'
+
+    # Normalize negative axes so that ``axis=-2`` pads rows; previously any
+    # axis other than 0 padded columns even when the size came from rows.
+    pad_width = [[0, 0], [0, 0]]
+    pad_width[axis % 2][1] = pad_size
+
+    return np.pad(x, pad_width, mode=mode, **kwargs)
+
+
+def feature_normalize(batch, mean_norm: bool = True, std_norm: bool = True):
+    """Collate a batch: pad features to a common length and normalize them.
+
+    Args:
+        batch: Sequence of dicts, each holding an 'id' entry and a 2D 'feat'
+            array. The second axis of 'feat' is the one that gets padded
+            (presumably time frames -- confirm against the dataset).
+        mean_norm: Subtract each row's mean, computed over unpadded columns.
+        std_norm: Divide each row by its std, computed over unpadded columns.
+
+    Returns:
+        A dict with 'ids' (list of ids), 'feats' (stacked, right-padded and
+        normalized features) and 'lengths' (float32 ratio of every original
+        length to the longest length in the batch).
+    """
+    ids = [item['id'] for item in batch]
+    lengths = np.asarray([item['feat'].shape[1] for item in batch])
+    max_length = lengths.max()
+    feats = np.stack(
+        [pad_right_2d(item['feat'], max_length) for item in batch])
+
+    # Features normalization if needed. Iterating over ``feats`` yields views,
+    # so the assignments below update the stacked array in place.
+    for feat, length in zip(feats, lengths):
+        valid = feat[:, :length]  # Excluding pad values.
+        mean = valid.mean(axis=-1, keepdims=True) if mean_norm else 0
+        if std_norm:
+            std = valid.std(axis=-1, keepdims=True)
+            # A constant row has zero std; dividing by it would produce
+            # NaN/inf, so leave such rows unscaled instead.
+            std[std == 0] = 1
+        else:
+            std = 1
+        feat[:, :length] = (valid - mean) / std
+        # Padding values should all still be 0.
+        assert feat[:, length:].sum() == 0
+
+    # Converts into ratios.
+    lengths = (lengths / max_length).astype(np.float32)
+
+    return {'ids': ids, 'feats': feats, 'lengths': lengths}
+
+
+def _build_eval_loader(subset, batch_size, num_workers):
+    """Build the VoxCeleb1 dataloader used to extract embeddings of ``subset``."""
+    dataset = VoxCeleb1(
+        subset=subset,
+        feat_type='melspectrogram',
+        random_chunk=False,
+        n_mels=80,
+        window_size=400,
+        hop_length=160)
+    # Shuffle to make embedding normalization more robust.
+    sampler = BatchSampler(dataset, batch_size=batch_size, shuffle=True)
+    return DataLoader(
+        dataset,
+        batch_sampler=sampler,
+        collate_fn=lambda x: feature_normalize(
+            x, mean_norm=True, std_norm=False),
+        num_workers=num_workers,
+        return_list=True)
+
+
+def _load_trials(trial_file):
+    """Read ``label enrol_path test_path`` trials into labels and utterance ids."""
+    labels, enrol_ids, test_ids = [], [], []
+    with open(trial_file, 'r') as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                # Tolerate blank lines instead of failing on unpacking.
+                continue
+            label, enrol_id, test_id = line.split(' ')
+            labels.append(int(label))
+            # Drop the file extension and flatten the path into an id.
+            enrol_ids.append(enrol_id.split('.')[0].replace('/', '-'))
+            test_ids.append(test_id.split('.')[0].replace('/', '-'))
+    return labels, enrol_ids, test_ids
+
+
+def main(args):
+    """Score the VoxCeleb1 verification trials with cosine similarity.
+
+    Loads an ECAPA-TDNN based model from ``args.load_checkpoint``, computes
+    an embedding for every utterance in the 'enrol' and 'test' subsets,
+    optionally applies a running global mean/std normalization to the
+    embeddings, and prints the EER and score threshold for the trial list
+    in ``VoxCeleb1.veri_test_file``.
+
+    Args:
+        args: Parsed command-line namespace providing ``device``,
+            ``batch_size``, ``num_workers``, ``load_checkpoint``,
+            ``global_embedding_norm``, ``embedding_mean_norm`` and
+            ``embedding_std_norm``. ``load_checkpoint`` is rewritten in place
+            to its absolute, user-expanded form.
+    """
+    # stage0: set the device, cpu or gpu
+    paddle.set_device(args.device)
+
+    # stage1: build the dnn backbone model network
+    # Alternative channel setting: [1024, 1024, 1024, 1024, 3072]
+    model_conf = {
+        "input_size": 80,
+        "channels": [512, 512, 512, 512, 1536],
+        "kernel_sizes": [5, 3, 3, 3, 1],
+        "dilations": [1, 2, 3, 4, 1],
+        "attention_channels": 128,
+        "lin_neurons": 192,
+    }
+    ecapa_tdnn = EcapaTdnn(**model_conf)
+
+    # stage2: build the speaker verification eval instance with backbone model
+    model = SpeakerIdetification(
+        backbone=ecapa_tdnn, num_class=VoxCeleb1.num_speakers)
+
+    # stage3: load the pre-trained model checkpoint into the sid model
+    args.load_checkpoint = os.path.abspath(
+        os.path.expanduser(args.load_checkpoint))
+    state_dict = paddle.load(
+        os.path.join(args.load_checkpoint, 'model.pdparams'))
+    model.set_state_dict(state_dict)
+    print(f'Checkpoint loaded from {args.load_checkpoint}')
+
+    # stage4: construct the enrol and test dataloaders
+    enrol_loader = _build_eval_loader('enrol', args.batch_size,
+                                      args.num_workers)
+    test_loader = _build_eval_loader('test', args.batch_size,
+                                     args.num_workers)
+
+    # stage5: we must set the model to eval mode
+    model.eval()
+
+    # stage6: running statistics for the global embedding normalization
+    embedding_mean = None
+    embedding_std = None
+    mean_norm = args.embedding_mean_norm
+    std_norm = args.embedding_std_norm
+    batch_count = 0
+
+    # stage7: compute embeddings of audios in enrol and test dataset from model
+    id2embedding = {}
+    # Run multiple times to make embedding normalization more stable; the
+    # second pass overwrites the embeddings stored by the first one.
+    for i in range(2):
+        for dl in [enrol_loader, test_loader]:
+            print(
+                f'Loop {[i+1]}: Computing embeddings on {dl.dataset.subset} dataset'
+            )
+            with paddle.no_grad():
+                for batch in tqdm(dl):
+                    # stage 7-1: extract the audio embedding
+                    ids = batch['ids']
+                    feats = batch['feats']
+                    lengths = batch['lengths']
+                    embeddings = model.backbone(feats, lengths).squeeze(
+                        -1).numpy()  # (N, emb_size, 1) -> (N, emb_size)
+
+                    # stage 7-2: global embedding normalization
+                    if args.global_embedding_norm:
+                        batch_count += 1
+                        mean = embeddings.mean(axis=0) if mean_norm else 0
+                        std = embeddings.std(axis=0) if std_norm else 1
+                        if embedding_mean is None and embedding_std is None:
+                            embedding_mean, embedding_std = mean, std
+                        else:
+                            # Running average: each new batch is weighted by
+                            # 1 / (number of batches seen so far).
+                            weight = 1 / batch_count
+                            embedding_mean = (
+                                1 - weight) * embedding_mean + weight * mean
+                            embedding_std = (
+                                1 - weight) * embedding_std + weight * std
+                        embeddings = (
+                            embeddings - embedding_mean) / embedding_std
+
+                    # Update embedding dict.
+                    id2embedding.update(dict(zip(ids, embeddings)))
+
+    # stage8: compute cosine scores for every trial pair
+    labels, enrol_ids, test_ids = _load_trials(VoxCeleb1.veri_test_file)
+
+    def stack_embeddings(utt_ids):
+        # (N, emb_size) tensor following the order of the trial list.
+        return paddle.to_tensor(
+            np.asarray(
+                [id2embedding[utt_id] for utt_id in utt_ids],
+                dtype='float32'))
+
+    cos_sim_func = paddle.nn.CosineSimilarity(axis=1)
+    enrol_embeddings = stack_embeddings(enrol_ids)
+    test_embeddings = stack_embeddings(test_ids)
+    scores = cos_sim_func(enrol_embeddings, test_embeddings)
+    EER, threshold = compute_eer(np.asarray(labels), scores.numpy())
+    print(
+        f'EER of verification test: {EER*100:.4f}%, score threshold: {threshold:.5f}'
+    )
+
+def str2bool(value):
+    """Parse a command-line string into a bool.
+
+    ``type=bool`` treats every non-empty string as True, so passing "False"
+    on the command line could never disable an option. This parser accepts
+    the usual spellings case-insensitively; the empty string is kept as
+    False because ``bool('')`` was the only way to get False before.
+
+    Raises:
+        argparse.ArgumentTypeError: If ``value`` is not a recognized boolean.
+    """
+    if isinstance(value, bool):
+        return value
+    text = value.strip().lower()
+    if text in ('true', 't', 'yes', 'y', '1'):
+        return True
+    if text in ('false', 'f', 'no', 'n', '0', ''):
+        return False
+    raise argparse.ArgumentTypeError(
+        f'Expected a boolean value, but got: {value!r}')
+
+
+if __name__ == "__main__":
+    # yapf: disable
+    # The first positional parameter of ArgumentParser is ``prog``; the
+    # module docstring belongs in ``description``.
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument('--device',
+                        choices=['cpu', 'gpu'],
+                        default="gpu",
+                        help="Select which device to train model, defaults to gpu.")
+    parser.add_argument("--batch-size",
+                        type=int,
+                        default=16,
+                        help="Total examples' number in batch for training.")
+    parser.add_argument("--num-workers",
+                        type=int,
+                        default=0,
+                        help="Number of workers in dataloader.")
+    parser.add_argument("--load-checkpoint",
+                        type=str,
+                        default='',
+                        help="Directory to load model checkpoint to contiune trainning.")
+    parser.add_argument("--global-embedding-norm",
+                        type=str2bool,
+                        default=True,
+                        help="Apply global normalization on speaker embeddings.")
+    parser.add_argument("--embedding-mean-norm",
+                        type=str2bool,
+                        default=True,
+                        help="Apply mean normalization on speaker embeddings.")
+    parser.add_argument("--embedding-std-norm",
+                        type=str2bool,
+                        default=False,
+                        help="Apply std normalization on speaker embeddings.")
+    args = parser.parse_args()
+    # yapf: enable
+
+    main(args)
--- a/examples/voxceleb/sv0/run.sh
+++ b/examples/voxceleb/sv0/run.sh
@ -2,9 +2,33 @@
 . ./path.sh
 set -e

-dir=./data/
-mkdir -p ${dir}
+#######################################################################
+# stage 1: train the speaker identification model
+# stage 2: test speaker identification 
+# stage 3: extract the training embedding to train the LDA and PLDA
+######################################################################
+
 # you can set the variable PPAUDIO_HOME to specifiy the downloaded the vox1 and vox2 dataset
-python3 \
-     local/train.py \
-     --data-dir ${dir}
+# default the dataset is the ~/.paddleaudio/
+# export PPAUDIO_HOME=
+
+# Stage to start from: a block guarded by `$stage -le N` runs only when
+# stage <= N, so the value 2 skips training and runs only the verification test.
+stage=2
+dir=data/                     # data directory
+exp_dir=exp/ecapa-tdnn/       # experiment directory
+mkdir -p ${dir}
+
+if [ $stage -le 1 ]; then
+     # stage 1: train the speaker identification model
+     # (launched through paddle.distributed.launch on GPUs 0-3)
+     python3 \
+          -m paddle.distributed.launch --gpus=0,1,2,3 \
+          local/train.py --device "gpu" --checkpoint-dir ${exp_dir} \
+          --save-freq 10 --data-dir ${dir} --batch-size 256 --epochs 60
+fi
+
+if [ $stage -le 2 ]; then
+     # stage 2: test speaker verification with cosine scoring,
+     # loading the checkpoint from the epoch_40 directory under ${exp_dir}
+     python3 \
+          local/speaker_verification_cosine.py \
+          --load-checkpoint ${exp_dir}/epoch_40/
+fi
+