Merge pull request #1646 from Honei/develop

[vec]add speaker verification score method
pull/1693/head
Honei 2 years ago committed by GitHub
commit f500fa8bde
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -30,6 +30,11 @@ wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
paddlespeech vector --task spk --input vec.job paddlespeech vector --task spk --input vec.job
echo -e "demo2 85236145389.wav \n demo3 85236145389.wav" | paddlespeech vector --task spk echo -e "demo2 85236145389.wav \n demo3 85236145389.wav" | paddlespeech vector --task spk
paddlespeech vector --task score --input "./85236145389.wav ./123456789.wav"
echo -e "demo4 85236145389.wav 85236145389.wav \n demo5 85236145389.wav 123456789.wav" > vec.job
paddlespeech vector --task score --input vec.job
``` ```
Usage: Usage:
@ -103,6 +108,19 @@ wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
audio_file='./85236145389.wav', audio_file='./85236145389.wav',
device=paddle.get_device()) device=paddle.get_device())
print('Audio embedding Result: \n{}'.format(audio_emb)) print('Audio embedding Result: \n{}'.format(audio_emb))
test_emb = vector_executor(
model='ecapatdnn_voxceleb12',
sample_rate=16000,
config=None, # Set `config` and `ckpt_path` to None to use pretrained model.
ckpt_path=None,
audio_file='./123456789.wav',
device=paddle.get_device())
print('Test embedding Result: \n{}'.format(test_emb))
# score range [0, 1]
score = vector_executor.get_embeddings_score(audio_emb, test_emb)
print(f"Eembeddings Score: {score}")
``` ```
Output Output
@ -149,6 +167,49 @@ wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
-3.3925855 5.079156 7.759716 4.677565 5.8457737 -3.3925855 5.079156 7.759716 4.677565 5.8457737
2.402413 7.7071047 3.9711342 -6.390043 6.1268735 2.402413 7.7071047 3.9711342 -6.390043 6.1268735
-3.7760346 -11.118123 ] -3.7760346 -11.118123 ]
# get the test embedding
Test embedding Result:
[ -1.902964 2.0690894 -8.034194 3.5472693 0.18089125
6.9085927 1.4097427 -1.9487704 -10.021278 -0.20755845
-8.04332 4.344489 2.3200977 -14.306299 5.184692
-11.55602 -3.8497238 0.6444722 1.2833948 2.6766639
0.5878921 0.7946299 1.7207596 2.5791872 14.998469
-1.3385371 15.031221 -0.8006958 1.99287 -9.52007
2.435466 4.003221 -4.33817 -4.898601 -5.304714
-18.033886 10.790787 -12.784645 -5.641755 2.9761686
-10.566622 1.4839455 6.152458 -5.7195854 2.8603241
6.112133 8.489869 5.5958056 1.2836679 -1.2293907
0.89927405 7.0288725 -2.854029 -0.9782962 5.8255906
14.905906 -5.025907 0.7866458 -4.2444224 -16.354029
10.521315 0.9604709 -3.3257897 7.144871 -13.592733
-8.568869 -1.7953678 0.26313916 10.916714 -6.9374123
1.857403 -6.2746415 2.8154466 -7.2338667 -2.293357
-0.05452765 5.4287076 5.0849075 -6.690375 -1.6183422
3.654291 0.94352573 -9.200294 -5.4749465 -3.5235846
1.3420814 4.240421 -2.772944 -2.8451524 16.311104
4.2969875 -1.762936 -12.5758915 8.595198 -0.8835239
-1.5708797 1.568961 1.1413603 3.5032008 -0.45251232
-6.786333 16.89443 5.3366146 -8.789056 0.6355629
3.2579517 -3.328322 7.5969577 0.66025066 -6.550468
-9.148656 2.020372 -0.4615173 1.1965656 -3.8764873
11.6562195 -6.0750933 12.182899 3.2218833 0.81969476
5.570001 -3.8459578 -7.205299 7.9262037 -7.6611166
-5.249467 -2.2671914 7.2658715 -13.298164 4.821147
-2.7263982 11.691089 -3.8918593 -2.838112 -1.0336838
-3.8034165 2.8536487 -5.60398 -1.1972581 1.3455094
-3.4903061 2.2408795 5.5010734 -3.970756 11.99696
-7.8858757 0.43160373 -5.5059714 4.3426995 16.322706
11.635366 0.72157705 -9.245714 -3.91465 -4.449838
-1.5716927 7.713747 -2.2430465 -6.198303 -13.481864
2.8156567 -5.7812386 5.1456156 2.7289324 -14.505571
13.270688 3.448231 -7.0659585 4.5886116 -4.466099
-0.296428 -11.463529 -2.6076477 14.110243 -6.9725137
-1.9962958 2.7119343 19.391657 0.01961198 14.607133
-1.6695905 -4.391516 1.3131028 -6.670972 -5.888604
12.0612335 5.9285784 3.3715196 1.492534 10.723728
-0.95514804 -12.085431 ]
# get the score between enroll and test
Eembeddings Score: 0.4292638301849365
``` ```
### 4.Pretrained Models ### 4.Pretrained Models

@ -29,6 +29,11 @@ wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
paddlespeech vector --task spk --input vec.job paddlespeech vector --task spk --input vec.job
echo -e "demo2 85236145389.wav \n demo3 85236145389.wav" | paddlespeech vector --task spk echo -e "demo2 85236145389.wav \n demo3 85236145389.wav" | paddlespeech vector --task spk
paddlespeech vector --task score --input "./85236145389.wav ./123456789.wav"
echo -e "demo4 85236145389.wav 85236145389.wav \n demo5 85236145389.wav 123456789.wav" > vec.job
paddlespeech vector --task score --input vec.job
``` ```
使用方法: 使用方法:
@ -101,6 +106,19 @@ wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
audio_file='./85236145389.wav', audio_file='./85236145389.wav',
device=paddle.get_device()) device=paddle.get_device())
print('Audio embedding Result: \n{}'.format(audio_emb)) print('Audio embedding Result: \n{}'.format(audio_emb))
test_emb = vector_executor(
model='ecapatdnn_voxceleb12',
sample_rate=16000,
config=None, # Set `config` and `ckpt_path` to None to use pretrained model.
ckpt_path=None,
audio_file='./123456789.wav',
device=paddle.get_device())
print('Test embedding Result: \n{}'.format(test_emb))
# score range [0, 1]
score = vector_executor.get_embeddings_score(audio_emb, test_emb)
print(f"Eembeddings Score: {score}")
``` ```
输出: 输出:
@ -146,6 +164,49 @@ wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
-3.3925855 5.079156 7.759716 4.677565 5.8457737 -3.3925855 5.079156 7.759716 4.677565 5.8457737
2.402413 7.7071047 3.9711342 -6.390043 6.1268735 2.402413 7.7071047 3.9711342 -6.390043 6.1268735
-3.7760346 -11.118123 ] -3.7760346 -11.118123 ]
# get the test embedding
Test embedding Result:
[ -1.902964 2.0690894 -8.034194 3.5472693 0.18089125
6.9085927 1.4097427 -1.9487704 -10.021278 -0.20755845
-8.04332 4.344489 2.3200977 -14.306299 5.184692
-11.55602 -3.8497238 0.6444722 1.2833948 2.6766639
0.5878921 0.7946299 1.7207596 2.5791872 14.998469
-1.3385371 15.031221 -0.8006958 1.99287 -9.52007
2.435466 4.003221 -4.33817 -4.898601 -5.304714
-18.033886 10.790787 -12.784645 -5.641755 2.9761686
-10.566622 1.4839455 6.152458 -5.7195854 2.8603241
6.112133 8.489869 5.5958056 1.2836679 -1.2293907
0.89927405 7.0288725 -2.854029 -0.9782962 5.8255906
14.905906 -5.025907 0.7866458 -4.2444224 -16.354029
10.521315 0.9604709 -3.3257897 7.144871 -13.592733
-8.568869 -1.7953678 0.26313916 10.916714 -6.9374123
1.857403 -6.2746415 2.8154466 -7.2338667 -2.293357
-0.05452765 5.4287076 5.0849075 -6.690375 -1.6183422
3.654291 0.94352573 -9.200294 -5.4749465 -3.5235846
1.3420814 4.240421 -2.772944 -2.8451524 16.311104
4.2969875 -1.762936 -12.5758915 8.595198 -0.8835239
-1.5708797 1.568961 1.1413603 3.5032008 -0.45251232
-6.786333 16.89443 5.3366146 -8.789056 0.6355629
3.2579517 -3.328322 7.5969577 0.66025066 -6.550468
-9.148656 2.020372 -0.4615173 1.1965656 -3.8764873
11.6562195 -6.0750933 12.182899 3.2218833 0.81969476
5.570001 -3.8459578 -7.205299 7.9262037 -7.6611166
-5.249467 -2.2671914 7.2658715 -13.298164 4.821147
-2.7263982 11.691089 -3.8918593 -2.838112 -1.0336838
-3.8034165 2.8536487 -5.60398 -1.1972581 1.3455094
-3.4903061 2.2408795 5.5010734 -3.970756 11.99696
-7.8858757 0.43160373 -5.5059714 4.3426995 16.322706
11.635366 0.72157705 -9.245714 -3.91465 -4.449838
-1.5716927 7.713747 -2.2430465 -6.198303 -13.481864
2.8156567 -5.7812386 5.1456156 2.7289324 -14.505571
13.270688 3.448231 -7.0659585 4.5886116 -4.466099
-0.296428 -11.463529 -2.6076477 14.110243 -6.9725137
-1.9962958 2.7119343 19.391657 0.01961198 14.607133
-1.6695905 -4.391516 1.3131028 -6.670972 -5.888604
12.0612335 5.9285784 3.3715196 1.492534 10.723728
-0.95514804 -12.085431 ]
# get the score between enroll and test
Eembeddings Score: 0.4292638301849365
``` ```
### 4.预训练模型 ### 4.预训练模型

@ -1,6 +1,9 @@
#!/bin/bash #!/bin/bash
wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
wget -c https://paddlespeech.bj.bcebos.com/vector/audio/123456789.wav
# vector # vector
paddlespeech vector --task spk --input ./85236145389.wav paddlespeech vector --task spk --input ./85236145389.wav
paddlespeech vector --task score --input "./85236145389.wav ./123456789.wav"

@ -15,6 +15,7 @@ import argparse
import os import os
import sys import sys
from collections import OrderedDict from collections import OrderedDict
from typing import Dict
from typing import List from typing import List
from typing import Optional from typing import Optional
from typing import Union from typing import Union
@ -79,7 +80,7 @@ class VectorExecutor(BaseExecutor):
"--task", "--task",
type=str, type=str,
default="spk", default="spk",
choices=["spk"], choices=["spk", "score"],
help="task type in vector domain") help="task type in vector domain")
self.parser.add_argument( self.parser.add_argument(
"--input", "--input",
@ -147,13 +148,40 @@ class VectorExecutor(BaseExecutor):
logger.info(f"task source: {task_source}") logger.info(f"task source: {task_source}")
# stage 3: process the audio one by one # stage 3: process the audio one by one
# we do action according the task type
task_result = OrderedDict() task_result = OrderedDict()
has_exceptions = False has_exceptions = False
for id_, input_ in task_source.items(): for id_, input_ in task_source.items():
try: try:
res = self(input_, model, sample_rate, config, ckpt_path, # extract the speaker audio embedding
device) if parser_args.task == "spk":
task_result[id_] = res logger.info("do vector spk task")
res = self(input_, model, sample_rate, config, ckpt_path,
device)
task_result[id_] = res
elif parser_args.task == "score":
logger.info("do vector score task")
logger.info(f"input content {input_}")
if len(input_.split()) != 2:
logger.error(
f"vector score task input {input_} wav num is not two,"
"that is {len(input_.split())}")
sys.exit(-1)
# get the enroll and test embedding
enroll_audio, test_audio = input_.split()
logger.info(
f"score task, enroll audio: {enroll_audio}, test audio: {test_audio}"
)
enroll_embedding = self(enroll_audio, model, sample_rate,
config, ckpt_path, device)
test_embedding = self(test_audio, model, sample_rate,
config, ckpt_path, device)
# get the score
res = self.get_embeddings_score(enroll_embedding,
test_embedding)
task_result[id_] = res
except Exception as e: except Exception as e:
has_exceptions = True has_exceptions = True
task_result[id_] = f'{e.__class__.__name__}: {e}' task_result[id_] = f'{e.__class__.__name__}: {e}'
@ -172,6 +200,49 @@ class VectorExecutor(BaseExecutor):
else: else:
return True return True
def _get_job_contents(
self, job_input: os.PathLike) -> Dict[str, Union[str, os.PathLike]]:
"""
Read a job input file and return its contents in a dictionary.
Refactor from the Executor._get_job_contents
Args:
job_input (os.PathLike): The job input file.
Returns:
Dict[str, str]: Contents of job input.
"""
job_contents = OrderedDict()
with open(job_input) as f:
for line in f:
line = line.strip()
if not line:
continue
k = line.split(' ')[0]
v = ' '.join(line.split(' ')[1:])
job_contents[k] = v
return job_contents
def get_embeddings_score(self, enroll_embedding, test_embedding):
"""get the enroll embedding and test embedding score
Args:
enroll_embedding (numpy.array): shape: (emb_size), enroll audio embedding
test_embedding (numpy.array): shape: (emb_size), test audio embedding
Returns:
score: the score between enroll embedding and test embedding
"""
if not hasattr(self, "score_func"):
self.score_func = paddle.nn.CosineSimilarity(axis=0)
logger.info("create the cosine score function ")
score = self.score_func(
paddle.to_tensor(enroll_embedding),
paddle.to_tensor(test_embedding))
return score.item()
@stats_wrapper @stats_wrapper
def __call__(self, def __call__(self,
audio_file: os.PathLike, audio_file: os.PathLike,

@ -79,6 +79,20 @@ class Conv1d(nn.Layer):
bias_attr=bias, ) bias_attr=bias, )
def forward(self, x): def forward(self, x):
"""Do conv1d forward
Args:
x (paddle.Tensor): [N, C, L] input data,
N is the batch,
C is the data dimension,
L is the time
Raises:
ValueError: only support the same padding type
Returns:
paddle.Tensor: the value of conv1d
"""
if self.padding == "same": if self.padding == "same":
x = self._manage_padding(x, self.kernel_size, self.dilation, x = self._manage_padding(x, self.kernel_size, self.dilation,
self.stride) self.stride)
@ -88,6 +102,20 @@ class Conv1d(nn.Layer):
return self.conv(x) return self.conv(x)
def _manage_padding(self, x, kernel_size: int, dilation: int, stride: int): def _manage_padding(self, x, kernel_size: int, dilation: int, stride: int):
"""Padding the input data
Args:
x (paddle.Tensor): [N, C, L] input data
N is the batch,
C is the data dimension,
L is the time
kernel_size (int): 1-d convolution kernel size
dilation (int): 1-d convolution dilation
stride (int): 1-d convolution stride
Returns:
paddle.Tensor: the padded input data
"""
L_in = x.shape[-1] # Detecting input shape L_in = x.shape[-1] # Detecting input shape
padding = self._get_padding_elem(L_in, stride, kernel_size, padding = self._get_padding_elem(L_in, stride, kernel_size,
dilation) # Time padding dilation) # Time padding
@ -101,6 +129,17 @@ class Conv1d(nn.Layer):
stride: int, stride: int,
kernel_size: int, kernel_size: int,
dilation: int): dilation: int):
"""Calculate the padding value in same mode
Args:
L_in (int): the times of the input data,
stride (int): 1-d convolution stride
kernel_size (int): 1-d convolution kernel size
dilation (int): 1-d convolution stride
Returns:
int: return the padding value in same mode
"""
if stride > 1: if stride > 1:
n_steps = math.ceil(((L_in - kernel_size * dilation) / stride) + 1) n_steps = math.ceil(((L_in - kernel_size * dilation) / stride) + 1)
L_out = stride * (n_steps - 1) + kernel_size * dilation L_out = stride * (n_steps - 1) + kernel_size * dilation
@ -245,6 +284,13 @@ class SEBlock(nn.Layer):
class AttentiveStatisticsPooling(nn.Layer): class AttentiveStatisticsPooling(nn.Layer):
def __init__(self, channels, attention_channels=128, global_context=True): def __init__(self, channels, attention_channels=128, global_context=True):
"""Compute the speaker verification statistics
The detail info is section 3.1 in https://arxiv.org/pdf/1709.01507.pdf
Args:
channels (int): input data channel or data dimension
attention_channels (int, optional): attention dimension. Defaults to 128.
global_context (bool, optional): If use the global context information. Defaults to True.
"""
super().__init__() super().__init__()
self.eps = 1e-12 self.eps = 1e-12

Loading…
Cancel
Save