add some comments in code

3 years ago · 9874fb7d75
parent b9eafddd94
commit 9874fb7d75
3 changed files with 39 additions and 47 deletions
--- a/paddlespeech/cli/vector/infer.py
+++ b/paddlespeech/cli/vector/infer.py
@ -19,34 +19,28 @@ from typing import List
 from typing import Optional
 from typing import Union
 import librosa
 import numpy as np
 import paddle
 import soundfile
 from yacs.config import CfgNode
 from paddleaudio.backends import load as load_audio
 from paddleaudio.compliance.librosa import melspectrogram
 from ..download import get_path_from_url
 from ..executor import BaseExecutor
 from ..log import logger
 from ..utils import cli_register
 from ..utils import download_and_decompress
 from ..utils import MODEL_HOME
 from ..utils import stats_wrapper
-from paddlespeech.vector.io.batch import feature_normalize
+from paddleaudio.backends import load as load_audio
-from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
+from paddleaudio.compliance.librosa import melspectrogram
 from paddlespeech.s2t.transform.transformation import Transformation
 from paddlespeech.s2t.utils.dynamic_import import dynamic_import
-from paddlespeech.s2t.utils.utility import UpdateConfig
+from paddlespeech.vector.io.batch import feature_normalize
 from paddlespeech.vector.modules.sid_model import SpeakerIdetification
 pretrained_models = {
    # The tags for pretrained_models should be "{model_name}[-{dataset}][-{sr}][-...]".
-    # e.g. "EcapaTdnn_voxceleb12-16k".
+    # e.g. "ecapatdnn_voxceleb12-16k".
    # Command line and python api use "{model_name}[-{dataset}]" as --model, usage:
-    # "paddlespeech vector --task spk --model EcapaTdnn_voxceleb12-voxceleb12-16k --sr 16000 --input ./input.wav"
+    # "paddlespeech vector --task spk --model ecapatdnn_voxceleb12 --sample_rate 16000 --input ./input.wav"
-    "EcapaTdnn_voxceleb12-16k": {
+    "ecapatdnn_voxceleb12-16k": {
        'url':
        'https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_0.tar.gz',
        'md5':
@ -59,7 +53,7 @@ pretrained_models = {
 }
 model_alias = {
-    "EcapaTdnn": "paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn",
+    "ecapatdnn": "paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn",
 }
@ -75,8 +69,8 @@ class VectorExecutor(BaseExecutor):
        self.parser.add_argument(
            "--model",
            type=str,
-            default="EcapaTdnn_voxceleb12",
+            default="ecapatdnn_voxceleb12",
-            choices=["EcapaTdnn_voxceleb12"],
+            choices=["ecapatdnn_voxceleb12"],
            help="Choose model type of asr task.")
        self.parser.add_argument(
            "--task",
@ -90,7 +84,7 @@ class VectorExecutor(BaseExecutor):
            "--sample_rate",
            type=int,
            default=16000,
-            choices=[16000, 8000],
+            choices=[16000],
            help="Choose the audio sample rate of the model. 8000 or 16000")
        self.parser.add_argument(
            "--ckpt_path",
@ -175,7 +169,7 @@ class VectorExecutor(BaseExecutor):
    @stats_wrapper
    def __call__(self,
                 audio_file: os.PathLike,
-                 model: str='EcapaTdnn-voxceleb12',
+                 model: str='ecapatdnn-voxceleb12',
                 sample_rate: int=16000,
                 config: os.PathLike=None,
                 ckpt_path: os.PathLike=None,
@ -197,9 +191,9 @@ class VectorExecutor(BaseExecutor):
    def _get_pretrained_path(self, tag: str) -> os.PathLike:
        support_models = list(pretrained_models.keys())
        assert tag in pretrained_models, \
-            'The model "{}" you want to use has not been supported, \
+            'The model "{}" you want to use has not been supported,'\
-            please choose other models.\n \
+            'please choose other models.\n' \
-            The support models includes \n\t\t{}'.format(tag, "\n\t\t".join(support_models))
+            'The support models includes\n\t\t{}'.format(tag, "\n\t\t".join(support_models))
        res_path = os.path.join(MODEL_HOME, tag)
        decompressed_path = download_and_decompress(pretrained_models[tag],
@ -212,7 +206,7 @@ class VectorExecutor(BaseExecutor):
        return decompressed_path
    def _init_from_path(self,
-                        model_type: str='EcapaTdnn_voxceleb12',
+                        model_type: str='ecapatdnn_voxceleb12',
                        sample_rate: int=16000,
                        cfg_path: Optional[os.PathLike]=None,
                        ckpt_path: Optional[os.PathLike]=None):
@ -228,8 +222,10 @@ class VectorExecutor(BaseExecutor):
            res_path = self._get_pretrained_path(tag)
            self.res_path = res_path
-            self.cfg_path = os.path.join(res_path, pretrained_models[tag]['cfg_path'])
+            self.cfg_path = os.path.join(res_path,
-            self.ckpt_path = os.path.join(res_path, pretrained_models[tag]['ckpt_path'] + '.pdparams')
+                                         pretrained_models[tag]['cfg_path'])
            self.ckpt_path = os.path.join(
                res_path, pretrained_models[tag]['ckpt_path'] + '.pdparams')
        else:
            self.cfg_path = os.path.abspath(cfg_path)
            self.ckpt_path = os.path.abspath(ckpt_path + ".pdparams")
@ -269,7 +265,7 @@ class VectorExecutor(BaseExecutor):
        feats = self._inputs["feats"]
        lengths = self._inputs["lengths"]
-        logger.info(f"start to do backbone network model forward")
+        logger.info("start to do backbone network model forward")
        logger.info(
            f"feats shape:{feats.shape}, lengths shape: {lengths.shape}")
        # embedding from (1, emb_size, 1) -> (emb_size)
--- a/paddlespeech/vector/io/batch.py
+++ b/paddlespeech/vector/io/batch.py
@ -11,9 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import numpy
 import numpy as np
 import paddle
-import numpy
+
 def waveform_collate_fn(batch):
    waveforms = np.stack([item['feat'] for item in batch])
@ -57,6 +58,7 @@ def pad_right_2d(x, target_length, axis=-1, mode='constant', **kwargs):
    return np.pad(x, pad_width, mode=mode, **kwargs)
 def batch_feature_normalize(batch, mean_norm: bool=True, std_norm: bool=True):
    ids = [item['id'] for item in batch]
    lengths = np.asarray([item['feat'].shape[1] for item in batch])
@ -100,12 +102,11 @@ def pad_right_to(array, target_shape, mode="constant", value=0):
    """
    assert len(target_shape) == array.ndim
    pads = []  # this contains the abs length of the padding for each dimension.
-    valid_vals = []  # thic contains the relative lengths for each dimension.
+    valid_vals = []  # this contains the relative lengths for each dimension.
-    i = 0 # iterating over target_shape ndims
+    i = 0  # iterating over target_shape ndims
    while i < len(target_shape):
-        assert (
+        assert (target_shape[i] >= array.shape[i]
-            target_shape[i] >= array.shape[i]
+                ), "Target shape must be >= original shape for every dim"
        ), "Target shape must be >= original shape for every dim"
        pads.append([0, target_shape[i] - array.shape[i]])
        valid_vals.append(array.shape[i] / target_shape[i])
        i += 1
@ -136,11 +137,8 @@ def batch_pad_right(arrays, mode="constant", value=0):
        # if there is only one array in the batch we simply unsqueeze it.
        return numpy.expand_dims(arrays[0], axis=0), numpy.array([1.0])
-    if not (
+    if not (any(
-        any(
+        [arrays[i].ndim == arrays[0].ndim for i in range(1, len(arrays))])):
            [arrays[i].ndim == arrays[0].ndim for i in range(1, len(arrays))]
        )
    ):
        raise IndexError("All arrays must have same number of dimensions")
    # FIXME we limit the support here: we allow padding of only the last dimension
@ -149,11 +147,9 @@ def batch_pad_right(arrays, mode="constant", value=0):
    for dim in range(arrays[0].ndim):
        if dim != (arrays[0].ndim - 1):
            if not all(
-                [x.shape[dim] == arrays[0].shape[dim] for x in arrays[1:]]
+                [x.shape[dim] == arrays[0].shape[dim] for x in arrays[1:]]):
            ):
                raise EnvironmentError(
-                    "arrays should have same dimensions except for last one"
+                    "arrays should have same dimensions except for last one")
                )
        max_shape.append(max([x.shape[dim] for x in arrays]))
    batched = []
@ -161,8 +157,7 @@ def batch_pad_right(arrays, mode="constant", value=0):
    for t in arrays:
        # for each array we apply pad_right_to
        padded, valid_percent = pad_right_to(
-            t, max_shape, mode=mode, value=value
+            t, max_shape, mode=mode, value=value)
        )
        batched.append(padded)
        valid.append(valid_percent[-1])
--- a/paddlespeech/vector/modules/sid_model.py
+++ b/paddlespeech/vector/modules/sid_model.py
@ -24,7 +24,8 @@ class SpeakerIdetification(nn.Layer):
            lin_blocks=0,
            lin_neurons=192,
            dropout=0.1, ):
-        """_summary_
+        """The speaker identification model, which includes the speaker backbone network 
           and a linear transform to the number of speaker classes used in training
        Args:
            backbone (Paddle.nn.Layer class): the speaker identification backbone network model
@ -63,14 +64,14 @@ class SpeakerIdetification(nn.Layer):
           including the speaker embedding model and the classifier model network
        Args:
-            x (Paddle.Tensor): input audio feats, 
+            x (paddle.Tensor): input audio feats, 
                               shape=[batch, dimension, times]
-            lengths (_type_, optional): input audio length.
+            lengths (paddle.Tensor, optional): input audio length.
                                        shape=[batch, times]
                                        Defaults to None.
        Returns:
-            _type_: _description_
+            paddle.Tensor: return the logits of the feats
        """
        # x.shape: (N, C, L)
        x = self.backbone(x, lengths).squeeze(