add some comments in code

pull/1523/head
xiongxinlei 3 years ago
parent b9eafddd94
commit 9874fb7d75

@ -19,34 +19,28 @@ from typing import List
from typing import Optional from typing import Optional
from typing import Union from typing import Union
import librosa
import numpy as np
import paddle import paddle
import soundfile import soundfile
from yacs.config import CfgNode from yacs.config import CfgNode
from paddleaudio.backends import load as load_audio
from paddleaudio.compliance.librosa import melspectrogram
from ..download import get_path_from_url
from ..executor import BaseExecutor from ..executor import BaseExecutor
from ..log import logger from ..log import logger
from ..utils import cli_register from ..utils import cli_register
from ..utils import download_and_decompress from ..utils import download_and_decompress
from ..utils import MODEL_HOME from ..utils import MODEL_HOME
from ..utils import stats_wrapper from ..utils import stats_wrapper
from paddlespeech.vector.io.batch import feature_normalize from paddleaudio.backends import load as load_audio
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddleaudio.compliance.librosa import melspectrogram
from paddlespeech.s2t.transform.transformation import Transformation
from paddlespeech.s2t.utils.dynamic_import import dynamic_import from paddlespeech.s2t.utils.dynamic_import import dynamic_import
from paddlespeech.s2t.utils.utility import UpdateConfig from paddlespeech.vector.io.batch import feature_normalize
from paddlespeech.vector.modules.sid_model import SpeakerIdetification from paddlespeech.vector.modules.sid_model import SpeakerIdetification
pretrained_models = { pretrained_models = {
# The tags for pretrained_models should be "{model_name}[-{dataset}][-{sr}][-...]". # The tags for pretrained_models should be "{model_name}[-{dataset}][-{sr}][-...]".
# e.g. "EcapaTdnn_voxceleb12-16k". # e.g. "ecapatdnn_voxceleb12-16k".
# Command line and python api use "{model_name}[-{dataset}]" as --model, usage: # Command line and python api use "{model_name}[-{dataset}]" as --model, usage:
# "paddlespeech vector --task spk --model EcapaTdnn_voxceleb12-voxceleb12-16k --sr 16000 --input ./input.wav" # "paddlespeech vector --task spk --model ecapatdnn_voxceleb12-voxceleb12-16k --sr 16000 --input ./input.wav"
"EcapaTdnn_voxceleb12-16k": { "ecapatdnn_voxceleb12-16k": {
'url': 'url':
'https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_0.tar.gz', 'https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_0.tar.gz',
'md5': 'md5':
@ -59,7 +53,7 @@ pretrained_models = {
} }
model_alias = { model_alias = {
"EcapaTdnn": "paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn", "ecapatdnn": "paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn",
} }
@ -75,8 +69,8 @@ class VectorExecutor(BaseExecutor):
self.parser.add_argument( self.parser.add_argument(
"--model", "--model",
type=str, type=str,
default="EcapaTdnn_voxceleb12", default="ecapatdnn_voxceleb12",
choices=["EcapaTdnn_voxceleb12"], choices=["ecapatdnn_voxceleb12"],
help="Choose model type of asr task.") help="Choose model type of asr task.")
self.parser.add_argument( self.parser.add_argument(
"--task", "--task",
@ -90,7 +84,7 @@ class VectorExecutor(BaseExecutor):
"--sample_rate", "--sample_rate",
type=int, type=int,
default=16000, default=16000,
choices=[16000, 8000], choices=[16000],
help="Choose the audio sample rate of the model. 8000 or 16000") help="Choose the audio sample rate of the model. 8000 or 16000")
self.parser.add_argument( self.parser.add_argument(
"--ckpt_path", "--ckpt_path",
@ -175,7 +169,7 @@ class VectorExecutor(BaseExecutor):
@stats_wrapper @stats_wrapper
def __call__(self, def __call__(self,
audio_file: os.PathLike, audio_file: os.PathLike,
model: str='EcapaTdnn-voxceleb12', model: str='ecapatdnn-voxceleb12',
sample_rate: int=16000, sample_rate: int=16000,
config: os.PathLike=None, config: os.PathLike=None,
ckpt_path: os.PathLike=None, ckpt_path: os.PathLike=None,
@ -197,9 +191,9 @@ class VectorExecutor(BaseExecutor):
def _get_pretrained_path(self, tag: str) -> os.PathLike: def _get_pretrained_path(self, tag: str) -> os.PathLike:
support_models = list(pretrained_models.keys()) support_models = list(pretrained_models.keys())
assert tag in pretrained_models, \ assert tag in pretrained_models, \
'The model "{}" you want to use has not been supported, \ 'The model "{}" you want to use has not been supported,'\
please choose other models.\n \ 'please choose other models.\n' \
The support models includes \n\t\t{}'.format(tag, "\n\t\t".join(support_models)) 'The support models includes\n\t\t{}'.format(tag, "\n\t\t".join(support_models))
res_path = os.path.join(MODEL_HOME, tag) res_path = os.path.join(MODEL_HOME, tag)
decompressed_path = download_and_decompress(pretrained_models[tag], decompressed_path = download_and_decompress(pretrained_models[tag],
@ -212,7 +206,7 @@ class VectorExecutor(BaseExecutor):
return decompressed_path return decompressed_path
def _init_from_path(self, def _init_from_path(self,
model_type: str='EcapaTdnn_voxceleb12', model_type: str='ecapatdnn_voxceleb12',
sample_rate: int=16000, sample_rate: int=16000,
cfg_path: Optional[os.PathLike]=None, cfg_path: Optional[os.PathLike]=None,
ckpt_path: Optional[os.PathLike]=None): ckpt_path: Optional[os.PathLike]=None):
@ -228,8 +222,10 @@ class VectorExecutor(BaseExecutor):
res_path = self._get_pretrained_path(tag) res_path = self._get_pretrained_path(tag)
self.res_path = res_path self.res_path = res_path
self.cfg_path = os.path.join(res_path, pretrained_models[tag]['cfg_path']) self.cfg_path = os.path.join(res_path,
self.ckpt_path = os.path.join(res_path, pretrained_models[tag]['ckpt_path'] + '.pdparams') pretrained_models[tag]['cfg_path'])
self.ckpt_path = os.path.join(
res_path, pretrained_models[tag]['ckpt_path'] + '.pdparams')
else: else:
self.cfg_path = os.path.abspath(cfg_path) self.cfg_path = os.path.abspath(cfg_path)
self.ckpt_path = os.path.abspath(ckpt_path + ".pdparams") self.ckpt_path = os.path.abspath(ckpt_path + ".pdparams")
@ -269,7 +265,7 @@ class VectorExecutor(BaseExecutor):
feats = self._inputs["feats"] feats = self._inputs["feats"]
lengths = self._inputs["lengths"] lengths = self._inputs["lengths"]
logger.info(f"start to do backbone network model forward") logger.info("start to do backbone network model forward")
logger.info( logger.info(
f"feats shape:{feats.shape}, lengths shape: {lengths.shape}") f"feats shape:{feats.shape}, lengths shape: {lengths.shape}")
# embedding from (1, emb_size, 1) -> (emb_size) # embedding from (1, emb_size, 1) -> (emb_size)

@ -11,9 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import numpy
import numpy as np import numpy as np
import paddle import paddle
import numpy
def waveform_collate_fn(batch): def waveform_collate_fn(batch):
waveforms = np.stack([item['feat'] for item in batch]) waveforms = np.stack([item['feat'] for item in batch])
@ -57,6 +58,7 @@ def pad_right_2d(x, target_length, axis=-1, mode='constant', **kwargs):
return np.pad(x, pad_width, mode=mode, **kwargs) return np.pad(x, pad_width, mode=mode, **kwargs)
def batch_feature_normalize(batch, mean_norm: bool=True, std_norm: bool=True): def batch_feature_normalize(batch, mean_norm: bool=True, std_norm: bool=True):
ids = [item['id'] for item in batch] ids = [item['id'] for item in batch]
lengths = np.asarray([item['feat'].shape[1] for item in batch]) lengths = np.asarray([item['feat'].shape[1] for item in batch])
@ -100,12 +102,11 @@ def pad_right_to(array, target_shape, mode="constant", value=0):
""" """
assert len(target_shape) == array.ndim assert len(target_shape) == array.ndim
pads = [] # this contains the abs length of the padding for each dimension. pads = [] # this contains the abs length of the padding for each dimension.
valid_vals = [] # thic contains the relative lengths for each dimension. valid_vals = [] # this contains the relative lengths for each dimension.
i = 0 # iterating over target_shape ndims i = 0 # iterating over target_shape ndims
while i < len(target_shape): while i < len(target_shape):
assert ( assert (target_shape[i] >= array.shape[i]
target_shape[i] >= array.shape[i] ), "Target shape must be >= original shape for every dim"
), "Target shape must be >= original shape for every dim"
pads.append([0, target_shape[i] - array.shape[i]]) pads.append([0, target_shape[i] - array.shape[i]])
valid_vals.append(array.shape[i] / target_shape[i]) valid_vals.append(array.shape[i] / target_shape[i])
i += 1 i += 1
@ -136,11 +137,8 @@ def batch_pad_right(arrays, mode="constant", value=0):
# if there is only one array in the batch we simply unsqueeze it. # if there is only one array in the batch we simply unsqueeze it.
return numpy.expand_dims(arrays[0], axis=0), numpy.array([1.0]) return numpy.expand_dims(arrays[0], axis=0), numpy.array([1.0])
if not ( if not (any(
any( [arrays[i].ndim == arrays[0].ndim for i in range(1, len(arrays))])):
[arrays[i].ndim == arrays[0].ndim for i in range(1, len(arrays))]
)
):
raise IndexError("All arrays must have same number of dimensions") raise IndexError("All arrays must have same number of dimensions")
# FIXME we limit the support here: we allow padding of only the last dimension # FIXME we limit the support here: we allow padding of only the last dimension
@ -149,11 +147,9 @@ def batch_pad_right(arrays, mode="constant", value=0):
for dim in range(arrays[0].ndim): for dim in range(arrays[0].ndim):
if dim != (arrays[0].ndim - 1): if dim != (arrays[0].ndim - 1):
if not all( if not all(
[x.shape[dim] == arrays[0].shape[dim] for x in arrays[1:]] [x.shape[dim] == arrays[0].shape[dim] for x in arrays[1:]]):
):
raise EnvironmentError( raise EnvironmentError(
"arrays should have same dimensions except for last one" "arrays should have same dimensions except for last one")
)
max_shape.append(max([x.shape[dim] for x in arrays])) max_shape.append(max([x.shape[dim] for x in arrays]))
batched = [] batched = []
@ -161,8 +157,7 @@ def batch_pad_right(arrays, mode="constant", value=0):
for t in arrays: for t in arrays:
# for each array we apply pad_right_to # for each array we apply pad_right_to
padded, valid_percent = pad_right_to( padded, valid_percent = pad_right_to(
t, max_shape, mode=mode, value=value t, max_shape, mode=mode, value=value)
)
batched.append(padded) batched.append(padded)
valid.append(valid_percent[-1]) valid.append(valid_percent[-1])

@ -24,7 +24,8 @@ class SpeakerIdetification(nn.Layer):
lin_blocks=0, lin_blocks=0,
lin_neurons=192, lin_neurons=192,
dropout=0.1, ): dropout=0.1, ):
"""_summary_ """The speaker identification model, which includes the speaker backbone network
and a linear transform to the number of speaker classes in training
Args: Args:
backbone (Paddle.nn.Layer class): the speaker identification backbone network model backbone (Paddle.nn.Layer class): the speaker identification backbone network model
@ -63,14 +64,14 @@ class SpeakerIdetification(nn.Layer):
including the speaker embedding model and the classifier model network including the speaker embedding model and the classifier model network
Args: Args:
x (Paddle.Tensor): input audio feats, x (paddle.Tensor): input audio feats,
shape=[batch, dimension, times] shape=[batch, dimension, times]
lengths (_type_, optional): input audio length. lengths (paddle.Tensor, optional): input audio length.
shape=[batch, times] shape=[batch, times]
Defaults to None. Defaults to None.
Returns: Returns:
_type_: _description_ paddle.Tensor: return the logits of the feats
""" """
# x.shape: (N, C, L) # x.shape: (N, C, L)
x = self.backbone(x, lengths).squeeze( x = self.backbone(x, lengths).squeeze(

Loading…
Cancel
Save