From 9b5f7f71ac0732298a526689ed5dc90b3e6f8779 Mon Sep 17 00:00:00 2001
From: xiongxinlei <xiongxinlei@baidu.com>
Date: Thu, 7 Apr 2022 18:07:28 +0800
Subject: [PATCH] add part ecapa-tdnn note, test=doc

---
 demos/speaker_verification/README.md     |  2 ++
 demos/speaker_verification/README_cn.md  |  2 ++
 paddlespeech/vector/models/ecapa_tdnn.py | 46 ++++++++++++++++++++++++
 3 files changed, 50 insertions(+)

diff --git a/demos/speaker_verification/README.md b/demos/speaker_verification/README.md
index 27413bd8..7d7180ae 100644
--- a/demos/speaker_verification/README.md
+++ b/demos/speaker_verification/README.md
@@ -117,6 +117,8 @@ wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
       audio_file='./123456789.wav',
       device=paddle.get_device())
   print('Test embedding Result: \n{}'.format(test_emb))
+
+  # score range [0, 1]
   score = vector_executor.get_embeddings_score(audio_emb, test_emb)
   print(f"Eembeddings Score: {score}")
   ```
diff --git a/demos/speaker_verification/README_cn.md b/demos/speaker_verification/README_cn.md
index 068802fd..db382f29 100644
--- a/demos/speaker_verification/README_cn.md
+++ b/demos/speaker_verification/README_cn.md
@@ -115,6 +115,8 @@ wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
       audio_file='./123456789.wav',
       device=paddle.get_device())
   print('Test embedding Result: \n{}'.format(test_emb))
+
+  # score range [0, 1]
   score = vector_executor.get_embeddings_score(audio_emb, test_emb)
   print(f"Eembeddings Score: {score}")
   ```
diff --git a/paddlespeech/vector/models/ecapa_tdnn.py b/paddlespeech/vector/models/ecapa_tdnn.py
index 0e7287cd..895ff13f 100644
--- a/paddlespeech/vector/models/ecapa_tdnn.py
+++ b/paddlespeech/vector/models/ecapa_tdnn.py
@@ -79,6 +79,20 @@ class Conv1d(nn.Layer):
             bias_attr=bias, )
 
     def forward(self, x):
+        """Do conv1d forward
+
+        Args:
+            x (paddle.Tensor): [N, C, L] input data,
+                                N is the batch size,
+                                C is the channel (feature) dimension,
+                                L is the number of time steps
+
+        Raises:
+            ValueError: if the padding type is not "same" (the only supported type)
+
+        Returns:
+            paddle.Tensor: the output of the 1-d convolution
+        """
         if self.padding == "same":
             x = self._manage_padding(x, self.kernel_size, self.dilation,
                                      self.stride)
@@ -88,6 +102,20 @@ class Conv1d(nn.Layer):
         return self.conv(x)
 
     def _manage_padding(self, x, kernel_size: int, dilation: int, stride: int):
+        """Padding the input data
+
+        Args:
+            x (paddle.Tensor): [N, C, L] input data,
+                                N is the batch size,
+                                C is the channel (feature) dimension,
+                                L is the number of time steps
+            kernel_size (int): 1-d convolution kernel size
+            dilation (int): 1-d convolution dilation
+            stride (int): 1-d convolution stride
+
+        Returns:
+            paddle.Tensor: the padded input data
+        """
         L_in = x.shape[-1]  # Detecting input shape
         padding = self._get_padding_elem(L_in, stride, kernel_size,
                                          dilation)  # Time padding
@@ -101,6 +129,17 @@ class Conv1d(nn.Layer):
                           stride: int,
                           kernel_size: int,
                           dilation: int):
+        """Calculate the padding value in same mode
+
+        Args:
+            L_in (int): length of the input data along the time axis
+            stride (int): 1-d convolution stride
+            kernel_size (int): 1-d convolution kernel size
+            dilation (int): 1-d convolution dilation
+
+        Returns:
+            int: return the padding value in same mode
+        """
         if stride > 1:
             n_steps = math.ceil(((L_in - kernel_size * dilation) / stride) + 1)
             L_out = stride * (n_steps - 1) + kernel_size * dilation
@@ -245,6 +284,13 @@ class SEBlock(nn.Layer):
 
 class AttentiveStatisticsPooling(nn.Layer):
     def __init__(self, channels, attention_channels=128, global_context=True):
+        """Compute the speaker verification statistics
+           For details, see section 3.1 of https://arxiv.org/pdf/2005.07143.pdf
+        Args:
+            channels (int): input data channel or data dimension
+            attention_channels (int, optional): attention dimension. Defaults to 128.
+            global_context (bool, optional): Whether to use global context information. Defaults to True.
+        """
         super().__init__()
 
         self.eps = 1e-12