add some comments in code

pull/1523/head
xiongxinlei 3 years ago
parent b9eafddd94
commit 9874fb7d75

@@ -19,34 +19,28 @@ from typing import List
 from typing import Optional
 from typing import Union

 import librosa
 import numpy as np
 import paddle
 import soundfile
 from yacs.config import CfgNode

+from paddleaudio.backends import load as load_audio
+from paddleaudio.compliance.librosa import melspectrogram
 from ..download import get_path_from_url
 from ..executor import BaseExecutor
 from ..log import logger
 from ..utils import cli_register
 from ..utils import download_and_decompress
 from ..utils import MODEL_HOME
 from ..utils import stats_wrapper
-from paddlespeech.vector.io.batch import feature_normalize
-from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
-from paddlespeech.s2t.transform.transformation import Transformation
-from paddleaudio.backends import load as load_audio
-from paddleaudio.compliance.librosa import melspectrogram
 from paddlespeech.s2t.utils.dynamic_import import dynamic_import
 from paddlespeech.s2t.utils.utility import UpdateConfig
+from paddlespeech.vector.io.batch import feature_normalize
 from paddlespeech.vector.modules.sid_model import SpeakerIdetification

 pretrained_models = {
     # The tags for pretrained_models should be "{model_name}[-{dataset}][-{sr}][-...]".
-    # e.g. "EcapaTdnn_voxceleb12-16k".
+    # e.g. "ecapatdnn_voxceleb12-16k".
     # Command line and python api use "{model_name}[-{dataset}]" as --model, usage:
-    # "paddlespeech vector --task spk --model EcapaTdnn_voxceleb12-voxceleb12-16k --sr 16000 --input ./input.wav"
-    "EcapaTdnn_voxceleb12-16k": {
+    # "paddlespeech vector --task spk --model ecapatdnn_voxceleb12-voxceleb12-16k --sr 16000 --input ./input.wav"
+    "ecapatdnn_voxceleb12-16k": {
         'url':
         'https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_0.tar.gz',
         'md5':
@@ -59,7 +53,7 @@ pretrained_models = {
 }

 model_alias = {
-    "EcapaTdnn": "paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn",
+    "ecapatdnn": "paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn",
 }
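
For readers skimming the alias table: each value encodes "module.path:ClassName", which dynamic_import resolves to a class at runtime. A minimal sketch of that resolution using plain importlib (a generic illustration, not the project's own helper):

import importlib

def resolve_alias(alias: str):
    # split "paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn" into its parts
    module_path, class_name = alias.split(":")
    module = importlib.import_module(module_path)
    return getattr(module, class_name)

# resolve_alias(model_alias["ecapatdnn"]) would return the EcapaTdnn class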
@@ -75,8 +69,8 @@ class VectorExecutor(BaseExecutor):
         self.parser.add_argument(
             "--model",
             type=str,
-            default="EcapaTdnn_voxceleb12",
-            choices=["EcapaTdnn_voxceleb12"],
+            default="ecapatdnn_voxceleb12",
+            choices=["ecapatdnn_voxceleb12"],
             help="Choose model type of asr task.")
         self.parser.add_argument(
             "--task",
@@ -90,7 +84,7 @@ class VectorExecutor(BaseExecutor):
             "--sample_rate",
             type=int,
             default=16000,
-            choices=[16000, 8000],
+            choices=[16000],
             help="Choose the audio sample rate of the model. 8000 or 16000")
         self.parser.add_argument(
             "--ckpt_path",
@@ -175,7 +169,7 @@ class VectorExecutor(BaseExecutor):
     @stats_wrapper
     def __call__(self,
                  audio_file: os.PathLike,
-                 model: str='EcapaTdnn-voxceleb12',
+                 model: str='ecapatdnn-voxceleb12',
                  sample_rate: int=16000,
                  config: os.PathLike=None,
                  ckpt_path: os.PathLike=None,
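
Note that the default here ('ecapatdnn-voxceleb12', with a hyphen) differs from the underscore form used in the argparse choices and the pretrained tag table; the sketch below uses the underscore form from the tag table. A hedged usage sketch of the executor (the import path and wav file are assumptions, not part of this diff):

from paddlespeech.cli.vector import VectorExecutor  # import path assumed

vector_executor = VectorExecutor()
audio_emb = vector_executor(
    audio_file="./input.wav",        # placeholder 16 kHz mono wav
    model="ecapatdnn_voxceleb12",    # the lowercase tag registered above
    sample_rate=16000)
print(audio_emb.shape)               # the extracted speaker embedding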
@@ -197,9 +191,9 @@ class VectorExecutor(BaseExecutor):
     def _get_pretrained_path(self, tag: str) -> os.PathLike:
         support_models = list(pretrained_models.keys())
         assert tag in pretrained_models, \
-            'The model "{}" you want to use has not been supported, \
-                please choose other models.\n \
-                The support models includes \n\t\t{}'.format(tag, "\n\t\t".join(support_models))
+            'The model "{}" you want to use has not been supported,'\
+            'please choose other models.\n' \
+            'The support models includes\n\t\t{}'.format(tag, "\n\t\t".join(support_models))

         res_path = os.path.join(MODEL_HOME, tag)
         decompressed_path = download_and_decompress(pretrained_models[tag],
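
One thing to watch in the rewritten message: adjacent Python string literals concatenate with no separator, so as written the comma runs straight into "please". A quick illustration of the rule:

msg = ('The model "x" you want to use has not been supported,'
       'please choose other models.')
# -> '...has not been supported,please choose other models.'
# a trailing space inside the first literal keeps the words apart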
@@ -212,7 +206,7 @@ class VectorExecutor(BaseExecutor):
         return decompressed_path

     def _init_from_path(self,
-                        model_type: str='EcapaTdnn_voxceleb12',
+                        model_type: str='ecapatdnn_voxceleb12',
                         sample_rate: int=16000,
                         cfg_path: Optional[os.PathLike]=None,
                         ckpt_path: Optional[os.PathLike]=None):
@@ -228,8 +222,10 @@ class VectorExecutor(BaseExecutor):
             res_path = self._get_pretrained_path(tag)
             self.res_path = res_path

-            self.cfg_path = os.path.join(res_path, pretrained_models[tag]['cfg_path'])
-            self.ckpt_path = os.path.join(res_path, pretrained_models[tag]['ckpt_path'] + '.pdparams')
+            self.cfg_path = os.path.join(res_path,
+                                         pretrained_models[tag]['cfg_path'])
+            self.ckpt_path = os.path.join(
+                res_path, pretrained_models[tag]['ckpt_path'] + '.pdparams')
         else:
             self.cfg_path = os.path.abspath(cfg_path)
             self.ckpt_path = os.path.abspath(ckpt_path + ".pdparams")
@@ -239,7 +235,7 @@ class VectorExecutor(BaseExecutor):
         logger.info(f"start to read the ckpt from {self.ckpt_path}")
         logger.info(f"read the config from {self.cfg_path}")
         logger.info(f"get the res path {self.res_path}")

+        # stage 2: read the config and init the model body
         self.config = CfgNode(new_allowed=True)
         self.config.merge_from_file(self.cfg_path)
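
The two lines above are the standard yacs pattern for loading a model config; a self-contained sketch (the yaml path is a placeholder):

from yacs.config import CfgNode

config = CfgNode(new_allowed=True)         # permit keys not declared up front
config.merge_from_file("conf/model.yaml")  # placeholder path
print(config)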
@@ -269,7 +265,7 @@ class VectorExecutor(BaseExecutor):
         feats = self._inputs["feats"]
         lengths = self._inputs["lengths"]

-        logger.info(f"start to do backbone network model forward")
+        logger.info("start to do backbone network model forward")
         logger.info(
             f"feats shape:{feats.shape}, lengths shape: {lengths.shape}")

         # embedding from (1, emb_size, 1) -> (emb_size)
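
The trailing comment describes the squeeze from (1, emb_size, 1) to (emb_size); a small numpy sketch with an assumed emb_size of 192:

import numpy as np

embedding = np.zeros((1, 192, 1))  # (batch=1, emb_size, time=1) from the model
embedding = embedding.squeeze()    # -> shape (192,), the flat speaker embedding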

@@ -11,9 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import numpy
 import numpy as np
 import paddle
-import numpy


 def waveform_collate_fn(batch):
     waveforms = np.stack([item['feat'] for item in batch])
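
For context on the collate function that starts above: np.stack requires every 'feat' entry to have the same length, so fixed-length waveforms are assumed. A hedged sketch of the expected batch layout (the lengths are illustrative):

import numpy as np

batch = [{'feat': np.zeros(16000)},  # one second at 16 kHz, assumed
         {'feat': np.zeros(16000)}]
waveforms = np.stack([item['feat'] for item in batch])  # shape (2, 16000)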
@@ -57,6 +58,7 @@ def pad_right_2d(x, target_length, axis=-1, mode='constant', **kwargs):
     return np.pad(x, pad_width, mode=mode, **kwargs)


 def batch_feature_normalize(batch, mean_norm: bool=True, std_norm: bool=True):
     ids = [item['id'] for item in batch]
     lengths = np.asarray([item['feat'].shape[1] for item in batch])
@@ -100,12 +102,11 @@ def pad_right_to(array, target_shape, mode="constant", value=0):
     """
     assert len(target_shape) == array.ndim
     pads = []  # this contains the abs length of the padding for each dimension.
-    valid_vals = []  # thic contains the relative lengths for each dimension.
-    i = 0  # iterating over target_shape ndims
+    valid_vals = []  # this contains the relative lengths for each dimension.
+    i = 0  # iterating over target_shape ndims
     while i < len(target_shape):
-        assert (
-            target_shape[i] >= array.shape[i]
-        ), "Target shape must be >= original shape for every dim"
+        assert (target_shape[i] >= array.shape[i]
+                ), "Target shape must be >= original shape for every dim"
         pads.append([0, target_shape[i] - array.shape[i]])
         valid_vals.append(array.shape[i] / target_shape[i])
         i += 1
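
A usage sketch of pad_right_to as reformatted above, with illustrative shapes:

import numpy as np

x = np.ones((2, 5))
padded, valid = pad_right_to(x, (2, 8))
# padded.shape == (2, 8); valid == [1.0, 0.625], i.e. 5/8 of the last
# dimension is real data, the rest is padding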
@@ -136,11 +137,8 @@ def batch_pad_right(arrays, mode="constant", value=0):
         # if there is only one array in the batch we simply unsqueeze it.
         return numpy.expand_dims(arrays[0], axis=0), numpy.array([1.0])

-    if not (
-        any(
-            [arrays[i].ndim == arrays[0].ndim for i in range(1, len(arrays))]
-        )
-    ):
+    if not (any(
+        [arrays[i].ndim == arrays[0].ndim for i in range(1, len(arrays))])):
         raise IndexError("All arrays must have same number of dimensions")

     # FIXME we limit the support here: we allow padding of only the last dimension
@@ -149,11 +147,9 @@ def batch_pad_right(arrays, mode="constant", value=0):
     for dim in range(arrays[0].ndim):
         if dim != (arrays[0].ndim - 1):
             if not all(
-                [x.shape[dim] == arrays[0].shape[dim] for x in arrays[1:]]
-            ):
+                    [x.shape[dim] == arrays[0].shape[dim] for x in arrays[1:]]):
                 raise EnvironmentError(
-                    "arrays should have same dimensions except for last one"
-                )
+                    "arrays should have same dimensions except for last one")
         max_shape.append(max([x.shape[dim] for x in arrays]))

     batched = []
@@ -161,8 +157,7 @@ def batch_pad_right(arrays, mode="constant", value=0):
     for t in arrays:
         # for each array we apply pad_right_to
         padded, valid_percent = pad_right_to(
-            t, max_shape, mode=mode, value=value
-        )
+            t, max_shape, mode=mode, value=value)
         batched.append(padded)
         valid.append(valid_percent[-1])
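
And a matching sketch for batch_pad_right, assuming the function finally stacks the padded arrays as the single-array branch above suggests; only the last dimension may differ, per the FIXME:

import numpy as np

a = np.ones((3, 4))
b = np.ones((3, 7))
batched, valid = batch_pad_right([a, b])
# batched.shape == (2, 3, 7); valid ≈ [0.57, 1.0], the per-item fraction
# of real (unpadded) data in the last dimension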

@@ -24,7 +24,8 @@ class SpeakerIdetification(nn.Layer):
                  lin_blocks=0,
                  lin_neurons=192,
                  dropout=0.1, ):
-        """_summary_
+        """The speaker identification model, which includes the speaker backbone
+           network and a linear transform to the speaker class num used in training.

         Args:
             backbone (Paddle.nn.Layer class): the speaker identification backbone network model
@@ -41,7 +42,7 @@ class SpeakerIdetification(nn.Layer):
             self.dropout = nn.Dropout(dropout)
         else:
             self.dropout = None

+        # construct the speaker classifier
         input_size = self.backbone.emb_size
         self.blocks = nn.LayerList()
@@ -63,14 +64,14 @@ class SpeakerIdetification(nn.Layer):
             including the speaker embedding model and the classifier model network

         Args:
-            x (Paddle.Tensor): input audio feats,
+            x (paddle.Tensor): input audio feats,
                 shape=[batch, dimension, times]
-            lengths (_type_, optional): input audio length.
+            lengths (paddle.Tensor, optional): input audio length.
                 shape=[batch, times]
                 Defaults to None.

         Returns:
-            _type_: _description_
+            paddle.Tensor: the logits of the input feats
         """
         # x.shape: (N, C, L)
         x = self.backbone(x, lengths).squeeze(