From 9b5f7f71ac0732298a526689ed5dc90b3e6f8779 Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Thu, 7 Apr 2022 18:07:28 +0800 Subject: [PATCH] add part ecapa-tdnn note, test=doc --- demos/speaker_verification/README.md | 2 ++ demos/speaker_verification/README_cn.md | 2 ++ paddlespeech/vector/models/ecapa_tdnn.py | 46 ++++++++++++++++++++++++ 3 files changed, 50 insertions(+) diff --git a/demos/speaker_verification/README.md b/demos/speaker_verification/README.md index 27413bd8..7d7180ae 100644 --- a/demos/speaker_verification/README.md +++ b/demos/speaker_verification/README.md @@ -117,6 +117,8 @@ wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav audio_file='./123456789.wav', device=paddle.get_device()) print('Test embedding Result: \n{}'.format(test_emb)) + + # score range [0, 1] score = vector_executor.get_embeddings_score(audio_emb, test_emb) print(f"Eembeddings Score: {score}") ``` diff --git a/demos/speaker_verification/README_cn.md b/demos/speaker_verification/README_cn.md index 068802fd..db382f29 100644 --- a/demos/speaker_verification/README_cn.md +++ b/demos/speaker_verification/README_cn.md @@ -115,6 +115,8 @@ wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav audio_file='./123456789.wav', device=paddle.get_device()) print('Test embedding Result: \n{}'.format(test_emb)) + + # score range [0, 1] score = vector_executor.get_embeddings_score(audio_emb, test_emb) print(f"Eembeddings Score: {score}") ``` diff --git a/paddlespeech/vector/models/ecapa_tdnn.py b/paddlespeech/vector/models/ecapa_tdnn.py index 0e7287cd..895ff13f 100644 --- a/paddlespeech/vector/models/ecapa_tdnn.py +++ b/paddlespeech/vector/models/ecapa_tdnn.py @@ -79,6 +79,20 @@ class Conv1d(nn.Layer): bias_attr=bias, ) def forward(self, x): + """Do conv1d forward + + Args: + x (paddle.Tensor): [N, C, L] input data, + N is the batch, + C is the data dimension, + L is the time + + Raises: + ValueError: only support the same padding type + + Returns: + 
paddle.Tensor: the value of conv1d + """ if self.padding == "same": x = self._manage_padding(x, self.kernel_size, self.dilation, self.stride) @@ -88,6 +102,20 @@ class Conv1d(nn.Layer): return self.conv(x) def _manage_padding(self, x, kernel_size: int, dilation: int, stride: int): + """Pad the input data + + Args: + x (paddle.Tensor): [N, C, L] input data + N is the batch, + C is the data dimension, + L is the time + kernel_size (int): 1-d convolution kernel size + dilation (int): 1-d convolution dilation + stride (int): 1-d convolution stride + + Returns: + paddle.Tensor: the padded input data + """ L_in = x.shape[-1] # Detecting input shape padding = self._get_padding_elem(L_in, stride, kernel_size, dilation) # Time padding @@ -101,6 +129,17 @@ class Conv1d(nn.Layer): stride: int, kernel_size: int, dilation: int): + """Calculate the padding value in same mode + + Args: + L_in (int): the time length of the input data + stride (int): 1-d convolution stride + kernel_size (int): 1-d convolution kernel size + dilation (int): 1-d convolution dilation + + Returns: + int: return the padding value in same mode + """ if stride > 1: n_steps = math.ceil(((L_in - kernel_size * dilation) / stride) + 1) L_out = stride * (n_steps - 1) + kernel_size * dilation @@ -245,6 +284,13 @@ class SEBlock(nn.Layer): class AttentiveStatisticsPooling(nn.Layer): def __init__(self, channels, attention_channels=128, global_context=True): + """Compute the speaker verification statistics + The detail info is section 3.1 in https://arxiv.org/pdf/2005.07143.pdf + Args: + channels (int): input data channel or data dimension + attention_channels (int, optional): attention dimension. Defaults to 128. + global_context (bool, optional): Whether to use the global context information. Defaults to True. + """ super().__init__() self.eps = 1e-12