Merge pull request #756 from PaddlePaddle/filter

test w/ all example & fix ctc api & add new io
Hui Zhang 3 years ago committed by GitHub
commit f05f367cc5


@@ -3431,7 +3431,7 @@
 "        convolution_layer_args = (output_size, cnn_module_kernel, activation,\n",
 "                                  cnn_module_norm, causal)\n",
 "\n",
-"        self.encoders = nn.ModuleList([\n",
+"        self.encoders = nn.LayerList([\n",
 "            ConformerEncoderLayer(\n",
 "                size=output_size,\n",
 "                self_attn=encoder_selfattn_layer(*encoder_selfattn_layer_args),\n",

@@ -30,24 +30,13 @@ logger = Log(__name__).getlog()
 logger.warn = logger.warning
 ########### hack paddle #############
-paddle.bool = 'bool'
-paddle.float16 = 'float16'
 paddle.half = 'float16'
-paddle.float32 = 'float32'
 paddle.float = 'float32'
-paddle.float64 = 'float64'
 paddle.double = 'float64'
-paddle.int8 = 'int8'
-paddle.int16 = 'int16'
 paddle.short = 'int16'
-paddle.int32 = 'int32'
 paddle.int = 'int32'
-paddle.int64 = 'int64'
 paddle.long = 'int64'
-paddle.uint8 = 'uint8'
 paddle.uint16 = 'uint16'
-paddle.complex64 = 'complex64'
-paddle.complex128 = 'complex128'
 paddle.cdouble = 'complex128'
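Only the torch-style alias names survive; each is just a paddle dtype string, which paddle factory APIs accept directly. A quick check, assuming the aliases above have been registered:

import paddle

ids = paddle.zeros([2, 3], dtype=paddle.long)   # paddle.long == 'int64' after the hack
print(ids.dtype)                                # paddle.int64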
@@ -403,45 +392,7 @@ if not hasattr(paddle.nn.functional, 'glu'):
 #     return x * 0.5 * (1.0 + paddle.erf(x / math.sqrt(2.0)))
-# hack loss
-def ctc_loss(logits,
-             labels,
-             input_lengths,
-             label_lengths,
-             blank=0,
-             reduction='mean',
-             norm_by_times=True):
-    #logger.info("my ctc loss with norm by times")
-    ## https://github.com/PaddlePaddle/Paddle/blob/f5ca2db2cc/paddle/fluid/operators/warpctc_op.h#L403
-    loss_out = paddle.fluid.layers.warpctc(logits, labels, blank, norm_by_times,
-                                           input_lengths, label_lengths)
-    loss_out = paddle.fluid.layers.squeeze(loss_out, [-1])
-    assert reduction in ['mean', 'sum', 'none']
-    if reduction == 'mean':
-        loss_out = paddle.mean(loss_out / label_lengths)
-    elif reduction == 'sum':
-        loss_out = paddle.sum(loss_out)
-    return loss_out
-
-logger.warn(
-    "override ctc_loss of paddle.nn.functional if exists, remove this when fixed!"
-)
-F.ctc_loss = ctc_loss
 ########### hack paddle.nn #############
-if not hasattr(paddle.nn, 'Module'):
-    logger.warn("register user Module to paddle.nn, remove this when fixed!")
-    setattr(paddle.nn, 'Module', paddle.nn.Layer)
-
-# maybe cause assert isinstance(sublayer, core.Layer)
-if not hasattr(paddle.nn, 'ModuleList'):
-    logger.warn(
-        "register user ModuleList to paddle.nn, remove this when fixed!")
-    setattr(paddle.nn, 'ModuleList', paddle.nn.LayerList)
 class GLU(nn.Layer):
     """Gated Linear Units (GLU) Layer"""

@@ -264,12 +264,12 @@ class U2Trainer(Trainer):
         config.data.manifest = config.data.test_manifest
         # filter test examples, will cause less examples, but no mismatch with training
         # and can use large batch size, save training time, so filter test egs now.
-        # config.data.min_input_len = 0.0  # second
-        # config.data.max_input_len = float('inf')  # second
-        # config.data.min_output_len = 0.0  # tokens
-        # config.data.max_output_len = float('inf')  # tokens
-        # config.data.min_output_input_ratio = 0.00
-        # config.data.max_output_input_ratio = float('inf')
+        config.data.min_input_len = 0.0  # second
+        config.data.max_input_len = float('inf')  # second
+        config.data.min_output_len = 0.0  # tokens
+        config.data.max_output_len = float('inf')  # tokens
+        config.data.min_output_input_ratio = 0.00
+        config.data.max_output_input_ratio = float('inf')
         test_dataset = ManifestDataset.from_config(config)
         # return text ord id

@@ -13,18 +13,28 @@
 # limitations under the License.
 """Contains the data augmentation pipeline."""
 import json
+from collections.abc import Sequence
+from inspect import signature
 import numpy as np
-from deepspeech.frontend.augmentor.impulse_response import ImpulseResponseAugmentor
-from deepspeech.frontend.augmentor.noise_perturb import NoisePerturbAugmentor
-from deepspeech.frontend.augmentor.online_bayesian_normalization import \
-    OnlineBayesianNormalizationAugmentor
-from deepspeech.frontend.augmentor.resample import ResampleAugmentor
-from deepspeech.frontend.augmentor.shift_perturb import ShiftPerturbAugmentor
-from deepspeech.frontend.augmentor.spec_augment import SpecAugmentor
-from deepspeech.frontend.augmentor.speed_perturb import SpeedPerturbAugmentor
-from deepspeech.frontend.augmentor.volume_perturb import VolumePerturbAugmentor
+from deepspeech.frontend.augmentor.base import AugmentorBase
+from deepspeech.utils.dynamic_import import dynamic_import
+from deepspeech.utils.log import Log
+
+__all__ = ["AugmentationPipeline"]
+
+logger = Log(__name__).getlog()
+
+import_alias = dict(
+    volume="deepspeech.frontend.augmentor.volume_perturb:VolumePerturbAugmentor",
+    shift="deepspeech.frontend.augmentor.shift_perturb:ShiftPerturbAugmentor",
+    speed="deepspeech.frontend.augmentor.speed_perturb:SpeedPerturbAugmentor",
+    resample="deepspeech.frontend.augmentor.resample:ResampleAugmentor",
+    bayesian_normal="deepspeech.frontend.augmentor.online_bayesian_normalization:OnlineBayesianNormalizationAugmentor",
+    noise="deepspeech.frontend.augmentor.noise_perturb:NoisePerturbAugmentor",
+    impulse="deepspeech.frontend.augmentor.impulse_response:ImpulseResponseAugmentor",
+    specaug="deepspeech.frontend.augmentor.spec_augment:SpecAugmentor", )
 class AugmentationPipeline():
@@ -78,20 +88,74 @@ class AugmentationPipeline():
     augmentor to take effect. If "prob" is zero, the augmentor does not take
     effect.
-    :param augmentation_config: Augmentation configuration in json string.
-    :type augmentation_config: str
-    :param random_seed: Random seed.
-    :type random_seed: int
-    :raises ValueError: If the augmentation json config is in incorrect format.
+    Params:
+        augmentation_config(str): Augmentation configuration in json string.
+        random_seed(int): Random seed.
+        train(bool): whether in train mode.
+
+    Raises:
+        ValueError: If the augmentation json config is in incorrect format.
     """

-    def __init__(self, augmentation_config: str, random_seed=0):
+    def __init__(self, augmentation_config: str, random_seed: int=0):
         self._rng = np.random.RandomState(random_seed)
         self._spec_types = ('specaug')
-        self._augmentors, self._rates = self._parse_pipeline_from(
-            augmentation_config, 'audio')
+
+        if augmentation_config is None:
+            self.conf = {}
+        else:
+            self.conf = json.loads(augmentation_config)
+
+        self._augmentors, self._rates = self._parse_pipeline_from('all')
+        self._audio_augmentors, self._audio_rates = self._parse_pipeline_from(
+            'audio')
         self._spec_augmentors, self._spec_rates = self._parse_pipeline_from(
-            augmentation_config, 'feature')
+            'feature')
def __call__(self, xs, uttid_list=None, **kwargs):
if not isinstance(xs, Sequence):
is_batch = False
xs = [xs]
else:
is_batch = True
if isinstance(uttid_list, str):
uttid_list = [uttid_list for _ in range(len(xs))]
if self.conf.get("mode", "sequential") == "sequential":
for idx, (func, rate) in enumerate(
zip(self._augmentors, self._rates), 0):
if self._rng.uniform(0., 1.) >= rate:
continue
# Derive only the args which the func has
try:
param = signature(func).parameters
except ValueError:
                    # Some functions, e.g. built-in functions, fail here
param = {}
_kwargs = {k: v for k, v in kwargs.items() if k in param}
try:
if uttid_list is not None and "uttid" in param:
xs = [
func(x, u, **_kwargs)
for x, u in zip(xs, uttid_list)
]
else:
xs = [func(x, **_kwargs) for x in xs]
except Exception:
                    logger.fatal("Caught an exception from the {}th func: {}"
                                 .format(idx, func))
raise
else:
raise NotImplementedError(
"Not supporting mode={}".format(self.conf["mode"]))
if is_batch:
return xs
else:
return xs[0]
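With the config now parsed once in the constructor, the same pipeline object serves the collator (transform_audio/transform_feature) and the batch-style __call__ used by the new reader. A minimal construction sketch (augmentor params hypothetical; audio_segment stands in for a real AudioSegment):

import json

config = json.dumps([
    {"type": "shift",
     "params": {"min_shift_ms": -5, "max_shift_ms": 5},
     "prob": 1.0},
])
pipeline = AugmentationPipeline(augmentation_config=config, random_seed=0)
pipeline.transform_audio(audio_segment)   # audio-level effects, in place
# feature-level augmentors (e.g. specaug) run via pipeline.transform_feature(feat)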
     def transform_audio(self, audio_segment):
         """Run the pre-processing pipeline for data augmentation.
@@ -101,7 +165,7 @@ class AugmentationPipeline():
         :param audio_segment: Audio segment to process.
         :type audio_segment: AudioSegment|SpeechSegment
         """
-        for augmentor, rate in zip(self._augmentors, self._rates):
+        for augmentor, rate in zip(self._audio_augmentors, self._audio_rates):
             if self._rng.uniform(0., 1.) < rate:
                 augmentor.transform_audio(audio_segment)
@@ -116,14 +180,14 @@ class AugmentationPipeline():
             spec_segment = augmentor.transform_feature(spec_segment)
         return spec_segment
-    def _parse_pipeline_from(self, config_json, aug_type='audio'):
+    def _parse_pipeline_from(self, aug_type='all'):
         """Parse the config json to build an augmentation pipeline."""
-        assert aug_type in ('audio', 'feature'), aug_type
-        try:
-            configs = json.loads(config_json)
+        assert aug_type in ('audio', 'feature', 'all'), aug_type
         audio_confs = []
         feature_confs = []
-        for config in configs:
+        all_confs = []
+        for config in self.conf:
+            all_confs.append(config)
             if config["type"] in self._spec_types:
                 feature_confs.append(config)
             else:
@@ -133,35 +197,22 @@ class AugmentationPipeline():
             aug_confs = audio_confs
         elif aug_type == 'feature':
             aug_confs = feature_confs
+        else:
+            aug_confs = all_confs
         augmentors = [
             self._get_augmentor(config["type"], config["params"])
             for config in aug_confs
         ]
         rates = [config["prob"] for config in aug_confs]
-        except Exception as e:
-            raise ValueError("Failed to parse the augmentation config json: "
-                             "%s" % str(e))
         return augmentors, rates
     def _get_augmentor(self, augmentor_type, params):
         """Return an augmentation model by the type name, and pass in params."""
-        if augmentor_type == "volume":
-            return VolumePerturbAugmentor(self._rng, **params)
-        elif augmentor_type == "shift":
-            return ShiftPerturbAugmentor(self._rng, **params)
-        elif augmentor_type == "speed":
-            return SpeedPerturbAugmentor(self._rng, **params)
-        elif augmentor_type == "resample":
-            return ResampleAugmentor(self._rng, **params)
-        elif augmentor_type == "bayesian_normal":
-            return OnlineBayesianNormalizationAugmentor(self._rng, **params)
-        elif augmentor_type == "noise":
-            return NoisePerturbAugmentor(self._rng, **params)
-        elif augmentor_type == "impulse":
-            return ImpulseResponseAugmentor(self._rng, **params)
-        elif augmentor_type == "specaug":
-            return SpecAugmentor(self._rng, **params)
-        else:
-            raise ValueError("Unknown augmentor type [%s]." % augmentor_type)
+        class_obj = dynamic_import(augmentor_type, import_alias)
+        assert issubclass(class_obj, AugmentorBase)
+        try:
+            obj = class_obj(self._rng, **params)
+        except Exception:
+            raise ValueError("Unknown augmentor type [%s]." % augmentor_type)
+        return obj

@@ -28,6 +28,10 @@ class AugmentorBase():
     def __init__(self):
         pass

+    @abstractmethod
+    def __call__(self, xs):
+        raise NotImplementedError
+
     @abstractmethod
     def transform_audio(self, audio_segment):
         """Adds various effects to the input audio segment. Such effects

@@ -30,6 +30,11 @@ class ImpulseResponseAugmentor(AugmentorBase):
         self._rng = rng
         self._impulse_manifest = read_manifest(impulse_manifest_path)

+    def __call__(self, x, uttid=None, train=True):
+        if not train:
+            return x
+        self.transform_audio(x)
+        return x
+
     def transform_audio(self, audio_segment):
         """Add impulse response effect.

@@ -36,6 +36,11 @@ class NoisePerturbAugmentor(AugmentorBase):
         self._rng = rng
         self._noise_manifest = read_manifest(manifest_path=noise_manifest_path)

+    def __call__(self, x, uttid=None, train=True):
+        if not train:
+            return x
+        self.transform_audio(x)
+        return x
+
     def transform_audio(self, audio_segment):
         """Add background noise audio.

@@ -44,6 +44,11 @@ class OnlineBayesianNormalizationAugmentor(AugmentorBase):
         self._rng = rng
         self._startup_delay = startup_delay

+    def __call__(self, x, uttid=None, train=True):
+        if not train:
+            return x
+        self.transform_audio(x)
+        return x
+
     def transform_audio(self, audio_segment):
         """Normalizes the input audio using the online Bayesian approach.

@@ -31,6 +31,11 @@ class ResampleAugmentor(AugmentorBase):
         self._new_sample_rate = new_sample_rate
         self._rng = rng

+    def __call__(self, x, uttid=None, train=True):
+        if not train:
+            return x
+        self.transform_audio(x)
+        return x
+
     def transform_audio(self, audio_segment):
         """Resamples the input audio to a target sample rate.

@@ -31,6 +31,11 @@ class ShiftPerturbAugmentor(AugmentorBase):
         self._max_shift_ms = max_shift_ms
         self._rng = rng

+    def __call__(self, x, uttid=None, train=True):
+        if not train:
+            return x
+        self.transform_audio(x)
+        return x
+
     def transform_audio(self, audio_segment):
         """Shift audio.

@@ -157,6 +157,11 @@ class SpecAugmentor(AugmentorBase):
         self._time_mask = (t_0, t_0 + t)
         return xs

+    def __call__(self, x, train=True):
+        if not train:
+            return x
+        return self.transform_feature(x)
+
     def transform_feature(self, xs: np.ndarray):
         """
         Args:

@@ -79,6 +79,11 @@ class SpeedPerturbAugmentor(AugmentorBase):
         self._rates = np.linspace(
             self._min_rate, self._max_rate, self._num_rates, endpoint=True)

+    def __call__(self, x, uttid=None, train=True):
+        if not train:
+            return x
+        self.transform_audio(x)
+        return x
+
     def transform_audio(self, audio_segment):
         """Sample a new speed rate from the given range and
         change the speed of the given audio clip.

@@ -37,6 +37,11 @@ class VolumePerturbAugmentor(AugmentorBase):
         self._max_gain_dBFS = max_gain_dBFS
         self._rng = rng

+    def __call__(self, x, uttid=None, train=True):
+        if not train:
+            return x
+        self.transform_audio(x)
+        return x
+
     def transform_audio(self, audio_segment):
         """Change audio loudness.

@@ -11,139 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
import numpy as np
from paddle.io import DataLoader
from deepspeech.io.collator import SpeechCollator
from deepspeech.io.dataset import ManifestDataset
from deepspeech.io.sampler import SortagradBatchSampler
from deepspeech.io.sampler import SortagradDistributedBatchSampler
def create_dataloader(manifest_path,
unit_type,
vocab_filepath,
mean_std_filepath,
spm_model_prefix,
augmentation_config='{}',
max_input_len=float('inf'),
min_input_len=0.0,
max_output_len=float('inf'),
min_output_len=0.0,
max_output_input_ratio=float('inf'),
min_output_input_ratio=0.0,
stride_ms=10.0,
window_ms=20.0,
max_freq=None,
specgram_type='linear',
feat_dim=None,
delta_delta=False,
use_dB_normalization=True,
random_seed=0,
keep_transcription_text=False,
is_training=False,
batch_size=1,
num_workers=0,
sortagrad=False,
shuffle_method=None,
dist=False):
dataset = ManifestDataset(
manifest_path=manifest_path,
unit_type=unit_type,
vocab_filepath=vocab_filepath,
mean_std_filepath=mean_std_filepath,
spm_model_prefix=spm_model_prefix,
augmentation_config=augmentation_config,
max_input_len=max_input_len,
min_input_len=min_input_len,
max_output_len=max_output_len,
min_output_len=min_output_len,
max_output_input_ratio=max_output_input_ratio,
min_output_input_ratio=min_output_input_ratio,
stride_ms=stride_ms,
window_ms=window_ms,
max_freq=max_freq,
specgram_type=specgram_type,
feat_dim=feat_dim,
delta_delta=delta_delta,
use_dB_normalization=use_dB_normalization,
random_seed=random_seed,
keep_transcription_text=keep_transcription_text)
if dist:
batch_sampler = SortagradDistributedBatchSampler(
dataset,
batch_size,
num_replicas=None,
rank=None,
shuffle=is_training,
drop_last=is_training,
sortagrad=is_training,
shuffle_method=shuffle_method)
else:
batch_sampler = SortagradBatchSampler(
dataset,
shuffle=is_training,
batch_size=batch_size,
drop_last=is_training,
sortagrad=is_training,
shuffle_method=shuffle_method)
def padding_batch(batch,
padding_to=-1,
flatten=False,
keep_transcription_text=True):
"""
Padding audio features with zeros to make them have the same shape (or
a user-defined shape) within one bach.
If ``padding_to`` is -1, the maximun shape in the batch will be used
as the target shape for padding. Otherwise, `padding_to` will be the
target shape (only refers to the second axis).
If `flatten` is True, features will be flatten to 1darray.
"""
new_batch = []
# get target shape
max_length = max([audio.shape[1] for audio, text in batch])
if padding_to != -1:
if padding_to < max_length:
raise ValueError("If padding_to is not -1, it should be larger "
"than any instance's shape in the batch")
max_length = padding_to
max_text_length = max([len(text) for audio, text in batch])
# padding
padded_audios = []
audio_lens = []
texts, text_lens = [], []
for audio, text in batch:
padded_audio = np.zeros([audio.shape[0], max_length])
padded_audio[:, :audio.shape[1]] = audio
if flatten:
padded_audio = padded_audio.flatten()
padded_audios.append(padded_audio)
audio_lens.append(audio.shape[1])
padded_text = np.zeros([max_text_length])
if keep_transcription_text:
padded_text[:len(text)] = [ord(t) for t in text] # string
else:
padded_text[:len(text)] = text # ids
texts.append(padded_text)
text_lens.append(len(text))
padded_audios = np.array(padded_audios).astype('float32')
audio_lens = np.array(audio_lens).astype('int64')
texts = np.array(texts).astype('int32')
text_lens = np.array(text_lens).astype('int64')
return padded_audios, audio_lens, texts, text_lens
# collate_fn=functools.partial(padding_batch, keep_transcription_text=keep_transcription_text),
collate_fn = SpeechCollator(keep_transcription_text=keep_transcription_text)
loader = DataLoader(
dataset,
batch_sampler=batch_sampler,
collate_fn=collate_fn,
num_workers=num_workers)
return loader

@@ -0,0 +1,469 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import numpy as np
from deepspeech.utils.log import Log
__all__ = ["make_batchset"]
logger = Log(__name__).getlog()
def batchfy_by_seq(
sorted_data,
batch_size,
max_length_in,
max_length_out,
min_batch_size=1,
shortest_first=False,
ikey="input",
iaxis=0,
okey="output",
oaxis=0, ):
"""Make batch set from json dictionary
:param List[(str, Dict[str, Any])] sorted_data: dictionary loaded from data.json
:param int batch_size: batch size
:param int max_length_in: maximum length of input to decide adaptive batch size
:param int max_length_out: maximum length of output to decide adaptive batch size
:param int min_batch_size: mininum batch size (for multi-gpu)
:param bool shortest_first: Sort from batch with shortest samples
to longest if true, otherwise reverse
:param str ikey: key to access input
(for ASR ikey="input", for TTS, MT ikey="output".)
:param int iaxis: dimension to access input
(for ASR, TTS iaxis=0, for MT iaxis="1".)
:param str okey: key to access output
(for ASR, MT okey="output". for TTS okey="input".)
:param int oaxis: dimension to access output
(for ASR, TTS, MT oaxis=0, reserved for future research, -1 means all axis.)
:return: List[List[Tuple[str, dict]]] list of batches
"""
if batch_size <= 0:
raise ValueError(f"Invalid batch_size={batch_size}")
# check #utts is more than min_batch_size
if len(sorted_data) < min_batch_size:
raise ValueError(
f"#utts({len(sorted_data)}) is less than min_batch_size({min_batch_size})."
)
# make list of minibatches
minibatches = []
start = 0
while True:
_, info = sorted_data[start]
ilen = int(info[ikey][iaxis]["shape"][0])
olen = (int(info[okey][oaxis]["shape"][0]) if oaxis >= 0 else
max(map(lambda x: int(x["shape"][0]), info[okey])))
factor = max(int(ilen / max_length_in), int(olen / max_length_out))
# change batchsize depending on the input and output length
# if ilen = 1000 and max_length_in = 800
# then b = batchsize / 2
# and max(min_batches, .) avoids batchsize = 0
bs = max(min_batch_size, int(batch_size / (1 + factor)))
end = min(len(sorted_data), start + bs)
minibatch = sorted_data[start:end]
if shortest_first:
minibatch.reverse()
# check each batch is more than minimum batchsize
if len(minibatch) < min_batch_size:
mod = min_batch_size - len(minibatch) % min_batch_size
additional_minibatch = [
sorted_data[i] for i in np.random.randint(0, start, mod)
]
if shortest_first:
additional_minibatch.reverse()
minibatch.extend(additional_minibatch)
minibatches.append(minibatch)
if end == len(sorted_data):
break
start = end
# batch: List[List[Tuple[str, dict]]]
return minibatches
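The adaptive rule above shrinks a batch whenever its leading (longest) utterance exceeds the reference lengths. A runnable toy illustration (feature dims and lengths hypothetical):

toy = [(f"utt{i}", {"input": [{"shape": [ilen, 83]}],
                    "output": [{"shape": [olen, 1]}]})
       for i, (ilen, olen) in enumerate(
           [(1000, 120), (900, 100), (400, 40), (300, 30)])]
batches = batchfy_by_seq(toy, batch_size=4,
                         max_length_in=800, max_length_out=150)
# first batch: factor = max(int(1000/800), int(120/150)) = 1
#              -> bs = max(1, int(4 / 2)) = 2, so [utt0, utt1], then [utt2, utt3]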
def batchfy_by_bin(
sorted_data,
batch_bins,
num_batches=0,
min_batch_size=1,
shortest_first=False,
ikey="input",
okey="output", ):
"""Make variably sized batch set, which maximizes
the number of bins up to `batch_bins`.
:param List[(str, Dict[str, Any])] sorted_data: dictionary loaded from data.json
:param int batch_bins: Maximum frames of a batch
:param int num_batches: # number of batches to use (for debug)
:param int min_batch_size: minimum batch size (for multi-gpu)
:param bool shortest_first: Sort from batch with shortest samples
to longest if true, otherwise reverse
:param str ikey: key to access input (for ASR ikey="input", for TTS ikey="output".)
:param str okey: key to access output (for ASR okey="output". for TTS okey="input".)
:return: List[Tuple[str, Dict[str, List[Dict[str, Any]]]] list of batches
"""
if batch_bins <= 0:
raise ValueError(f"invalid batch_bins={batch_bins}")
length = len(sorted_data)
idim = int(sorted_data[0][1][ikey][0]["shape"][1])
odim = int(sorted_data[0][1][okey][0]["shape"][1])
logger.info("# utts: " + str(len(sorted_data)))
minibatches = []
start = 0
n = 0
while True:
# Dynamic batch size depending on size of samples
b = 0
next_size = 0
max_olen = 0
while next_size < batch_bins and (start + b) < length:
ilen = int(sorted_data[start + b][1][ikey][0]["shape"][0]) * idim
olen = int(sorted_data[start + b][1][okey][0]["shape"][0]) * odim
if olen > max_olen:
max_olen = olen
next_size = (max_olen + ilen) * (b + 1)
if next_size <= batch_bins:
b += 1
elif next_size == 0:
raise ValueError(
f"Can't fit one sample in batch_bins ({batch_bins}): "
f"Please increase the value")
end = min(length, start + max(min_batch_size, b))
batch = sorted_data[start:end]
if shortest_first:
batch.reverse()
minibatches.append(batch)
# Check for min_batch_size and fixes the batches if needed
i = -1
while len(minibatches[i]) < min_batch_size:
missing = min_batch_size - len(minibatches[i])
if -i == len(minibatches):
minibatches[i + 1].extend(minibatches[i])
minibatches = minibatches[1:]
break
else:
minibatches[i].extend(minibatches[i - 1][:missing])
minibatches[i - 1] = minibatches[i - 1][missing:]
i -= 1
if end == length:
break
start = end
n += 1
if num_batches > 0:
minibatches = minibatches[:num_batches]
lengths = [len(x) for x in minibatches]
logger.info(
str(len(minibatches)) + " batches containing from " + str(min(lengths))
+ " to " + str(max(lengths)) + " samples " + "(avg " + str(
int(np.mean(lengths))) + " samples).")
return minibatches
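The bin criterion charges each batch (max_olen + ilen) * (b + 1) "bins" (frames times feature dim), so one long utterance caps how many neighbours can join it. A quick arithmetic check with hypothetical sizes:

# idim = 83, batch_bins = 800_000
# every utt: ilen = 1000 frames -> ilen * idim = 83_000 bins; max_olen = 120
# (120 + 83_000) * 9  = 748_080 <= 800_000  -> 9 utts accepted
# (120 + 83_000) * 10 = 831_200 >  800_000  -> stop; this batch holds 9 utts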
def batchfy_by_frame(
sorted_data,
max_frames_in,
max_frames_out,
max_frames_inout,
num_batches=0,
min_batch_size=1,
shortest_first=False,
ikey="input",
okey="output", ):
"""Make variable batch set, which maximizes the number of frames to max_batch_frame.
    :param List[(str, Dict[str, Any])] sorted_data: dictionary loaded from data.json
:param int max_frames_in: Maximum input frames of a batch
:param int max_frames_out: Maximum output frames of a batch
:param int max_frames_inout: Maximum input+output frames of a batch
:param int num_batches: # number of batches to use (for debug)
:param int min_batch_size: minimum batch size (for multi-gpu)
:param bool shortest_first: Sort from batch with shortest samples
to longest if true, otherwise reverse
:param str ikey: key to access input (for ASR ikey="input", for TTS ikey="output".)
:param str okey: key to access output (for ASR okey="output". for TTS okey="input".)
:return: List[Tuple[str, Dict[str, List[Dict[str, Any]]]] list of batches
"""
if max_frames_in <= 0 and max_frames_out <= 0 and max_frames_inout <= 0:
raise ValueError(
"At least, one of `--batch-frames-in`, `--batch-frames-out` or "
"`--batch-frames-inout` should be > 0")
length = len(sorted_data)
minibatches = []
start = 0
end = 0
while end != length:
# Dynamic batch size depending on size of samples
b = 0
max_olen = 0
max_ilen = 0
while (start + b) < length:
ilen = int(sorted_data[start + b][1][ikey][0]["shape"][0])
if ilen > max_frames_in and max_frames_in != 0:
raise ValueError(
f"Can't fit one sample in --batch-frames-in ({max_frames_in}): "
f"Please increase the value")
olen = int(sorted_data[start + b][1][okey][0]["shape"][0])
if olen > max_frames_out and max_frames_out != 0:
raise ValueError(
f"Can't fit one sample in --batch-frames-out ({max_frames_out}): "
f"Please increase the value")
if ilen + olen > max_frames_inout and max_frames_inout != 0:
                raise ValueError(
                    f"Can't fit one sample in --batch-frames-inout ({max_frames_inout}): "
                    f"Please increase the value")
max_olen = max(max_olen, olen)
max_ilen = max(max_ilen, ilen)
in_ok = max_ilen * (b + 1) <= max_frames_in or max_frames_in == 0
out_ok = max_olen * (b + 1) <= max_frames_out or max_frames_out == 0
inout_ok = (max_ilen + max_olen) * (
b + 1) <= max_frames_inout or max_frames_inout == 0
if in_ok and out_ok and inout_ok:
# add more seq in the minibatch
b += 1
else:
# no more seq in the minibatch
break
end = min(length, start + b)
batch = sorted_data[start:end]
if shortest_first:
batch.reverse()
minibatches.append(batch)
# Check for min_batch_size and fixes the batches if needed
i = -1
while len(minibatches[i]) < min_batch_size:
missing = min_batch_size - len(minibatches[i])
if -i == len(minibatches):
minibatches[i + 1].extend(minibatches[i])
minibatches = minibatches[1:]
break
else:
minibatches[i].extend(minibatches[i - 1][:missing])
minibatches[i - 1] = minibatches[i - 1][missing:]
i -= 1
start = end
if num_batches > 0:
minibatches = minibatches[:num_batches]
lengths = [len(x) for x in minibatches]
logger.info(
str(len(minibatches)) + " batches containing from " + str(min(lengths))
+ " to " + str(max(lengths)) + " samples" + "(avg " + str(
int(np.mean(lengths))) + " samples).")
return minibatches
def batchfy_shuffle(data, batch_size, min_batch_size, num_batches,
shortest_first):
import random
logger.info("use shuffled batch.")
sorted_data = random.sample(data.items(), len(data.items()))
logger.info("# utts: " + str(len(sorted_data)))
# make list of minibatches
minibatches = []
start = 0
while True:
end = min(len(sorted_data), start + batch_size)
# check each batch is more than minimum batchsize
minibatch = sorted_data[start:end]
if shortest_first:
minibatch.reverse()
if len(minibatch) < min_batch_size:
mod = min_batch_size - len(minibatch) % min_batch_size
additional_minibatch = [
sorted_data[i] for i in np.random.randint(0, start, mod)
]
if shortest_first:
additional_minibatch.reverse()
minibatch.extend(additional_minibatch)
minibatches.append(minibatch)
if end == len(sorted_data):
break
start = end
# for debugging
if num_batches > 0:
minibatches = minibatches[:num_batches]
logger.info("# minibatches: " + str(len(minibatches)))
return minibatches
BATCH_COUNT_CHOICES = ["auto", "seq", "bin", "frame"]
BATCH_SORT_KEY_CHOICES = ["input", "output", "shuffle"]
def make_batchset(
data,
batch_size=0,
max_length_in=float("inf"),
max_length_out=float("inf"),
num_batches=0,
min_batch_size=1,
shortest_first=False,
batch_sort_key="input",
count="auto",
batch_bins=0,
batch_frames_in=0,
batch_frames_out=0,
batch_frames_inout=0,
iaxis=0,
oaxis=0, ):
"""Make batch set from json dictionary
if utts have "category" value,
>>> data = [{'category': 'A', 'input': ..., 'utt':'utt1'},
... {'category': 'B', 'input': ..., 'utt':'utt2'},
... {'category': 'B', 'input': ..., 'utt':'utt3'},
... {'category': 'A', 'input': ..., 'utt':'utt4'}]
    >>> make_batchset(data, batch_size=2, ...)
    [[('utt1', ...), ('utt4', ...)], [('utt2', ...), ('utt3', ...)]]
    Note that if any utt doesn't have "category",
    this performs the same as batchfy_by_{count}
:param List[Dict[str, Any]] data: dictionary loaded from data.json
:param int batch_size: maximum number of sequences in a minibatch.
:param int batch_bins: maximum number of bins (frames x dim) in a minibatch.
:param int batch_frames_in: maximum number of input frames in a minibatch.
:param int batch_frames_out: maximum number of output frames in a minibatch.
    :param int batch_frames_inout: maximum number of input+output frames in a minibatch.
:param str count: strategy to count maximum size of batch.
For choices, see espnet.asr.batchfy.BATCH_COUNT_CHOICES
:param int max_length_in: maximum length of input to decide adaptive batch size
:param int max_length_out: maximum length of output to decide adaptive batch size
:param int num_batches: # number of batches to use (for debug)
:param int min_batch_size: minimum batch size (for multi-gpu)
:param bool shortest_first: Sort from batch with shortest samples
to longest if true, otherwise reverse
:param str batch_sort_key: how to sort data before creating minibatches
["input", "output", "shuffle"]
:param int iaxis: dimension to access input
(for ASR, TTS iaxis=0, for MT iaxis="1".)
:param int oaxis: dimension to access output (for ASR, TTS, MT oaxis=0,
reserved for future research, -1 means all axis.)
:return: List[List[Tuple[str, dict]]] list of batches
"""
# check args
if count not in BATCH_COUNT_CHOICES:
raise ValueError(
f"arg 'count' ({count}) should be one of {BATCH_COUNT_CHOICES}")
if batch_sort_key not in BATCH_SORT_KEY_CHOICES:
raise ValueError(f"arg 'batch_sort_key' ({batch_sort_key}) should be "
f"one of {BATCH_SORT_KEY_CHOICES}")
ikey = "input"
okey = "output"
batch_sort_axis = 0 # index of list
if count == "auto":
if batch_size != 0:
count = "seq"
elif batch_bins != 0:
count = "bin"
elif batch_frames_in != 0 or batch_frames_out != 0 or batch_frames_inout != 0:
count = "frame"
else:
raise ValueError(
f"cannot detect `count` manually set one of {BATCH_COUNT_CHOICES}"
)
logger.info(f"count is auto detected as {count}")
if count != "seq" and batch_sort_key == "shuffle":
raise ValueError(
"batch_sort_key=shuffle is only available if batch_count=seq")
category2data = {} # Dict[str, dict]
for v in data:
k = v['utt']
category2data.setdefault(v.get("category"), {})[k] = v
batches_list = [] # List[List[List[Tuple[str, dict]]]]
for d in category2data.values():
if batch_sort_key == "shuffle":
batches = batchfy_shuffle(d, batch_size, min_batch_size,
num_batches, shortest_first)
batches_list.append(batches)
continue
# sort it by input lengths (long to short)
sorted_data = sorted(
d.items(),
key=lambda data: int(data[1][batch_sort_key][batch_sort_axis]["shape"][0]),
reverse=not shortest_first, )
logger.info("# utts: " + str(len(sorted_data)))
if count == "seq":
batches = batchfy_by_seq(
sorted_data,
batch_size=batch_size,
max_length_in=max_length_in,
max_length_out=max_length_out,
min_batch_size=min_batch_size,
shortest_first=shortest_first,
ikey=ikey,
iaxis=iaxis,
okey=okey,
oaxis=oaxis, )
if count == "bin":
batches = batchfy_by_bin(
sorted_data,
batch_bins=batch_bins,
min_batch_size=min_batch_size,
shortest_first=shortest_first,
ikey=ikey,
okey=okey, )
if count == "frame":
batches = batchfy_by_frame(
sorted_data,
max_frames_in=batch_frames_in,
max_frames_out=batch_frames_out,
max_frames_inout=batch_frames_inout,
min_batch_size=min_batch_size,
shortest_first=shortest_first,
ikey=ikey,
okey=okey, )
batches_list.append(batches)
if len(batches_list) == 1:
batches = batches_list[0]
else:
# Concat list. This way is faster than "sum(batch_list, [])"
batches = list(itertools.chain(*batches_list))
# for debugging
if num_batches > 0:
batches = batches[:num_batches]
logger.info("# minibatches: " + str(len(batches)))
# batch: List[List[Tuple[str, dict]]]
return batches

@@ -23,7 +23,7 @@ from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer
 from deepspeech.frontend.normalizer import FeatureNormalizer
 from deepspeech.frontend.speech import SpeechSegment
 from deepspeech.frontend.utility import IGNORE_ID
-from deepspeech.io.utility import pad_sequence
+from deepspeech.io.utility import pad_list
 from deepspeech.utils.log import Log

 __all__ = ["SpeechCollator"]
@@ -286,13 +286,12 @@ class SpeechCollator():
             texts.append(tokens)
             text_lens.append(tokens.shape[0])

-        padded_audios = pad_sequence(
-            audios, padding_value=0.0).astype(np.float32)  #[B, T, D]
-        audio_lens = np.array(audio_lens).astype(np.int64)
-        padded_texts = pad_sequence(
-            texts, padding_value=IGNORE_ID).astype(np.int64)
-        text_lens = np.array(text_lens).astype(np.int64)
-        return utts, padded_audios, audio_lens, padded_texts, text_lens
+        #[B, T, D]
+        xs_pad = pad_list(audios, 0.0).astype(np.float32)
+        ilens = np.array(audio_lens).astype(np.int64)
+        ys_pad = pad_list(texts, IGNORE_ID).astype(np.int64)
+        olens = np.array(text_lens).astype(np.int64)
+        return utts, xs_pad, ilens, ys_pad, olens

     @property
     def manifest(self):

@@ -0,0 +1,80 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from deepspeech.io.utility import pad_list
from deepspeech.utils.log import Log
__all__ = ["CustomConverter"]
logger = Log(__name__).getlog()
class CustomConverter():
"""Custom batch converter.
Args:
subsampling_factor (int): The subsampling factor.
dtype (np.dtype): Data type to convert.
"""
def __init__(self, subsampling_factor=1, dtype=np.float32):
"""Construct a CustomConverter object."""
self.subsampling_factor = subsampling_factor
self.ignore_id = -1
self.dtype = dtype
def __call__(self, batch):
"""Transform a batch and send it to a device.
Args:
batch (list): The batch to transform.
Returns:
tuple(paddle.Tensor, paddle.Tensor, paddle.Tensor)
"""
# batch should be located in list
assert len(batch) == 1
(xs, ys), utts = batch[0]
# perform subsampling
if self.subsampling_factor > 1:
xs = [x[::self.subsampling_factor, :] for x in xs]
# get batch of lengths of input sequences
ilens = np.array([x.shape[0] for x in xs])
# perform padding and convert to tensor
# currently only support real number
if xs[0].dtype.kind == "c":
xs_pad_real = pad_list([x.real for x in xs], 0).astype(self.dtype)
xs_pad_imag = pad_list([x.imag for x in xs], 0).astype(self.dtype)
# Note(kamo):
# {'real': ..., 'imag': ...} will be changed to ComplexTensor in E2E.
# Don't create ComplexTensor and give it E2E here
            # because torch.nn.DataParallel can't handle it.
xs_pad = {"real": xs_pad_real, "imag": xs_pad_imag}
else:
xs_pad = pad_list(xs, 0).astype(self.dtype)
# NOTE: this is for multi-output (e.g., speech translation)
ys_pad = pad_list(
[np.array(y[0][:]) if isinstance(y, tuple) else y for y in ys],
self.ignore_id)
olens = np.array(
[y[0].shape[0] if isinstance(y, tuple) else y.shape[0] for y in ys])
return utts, xs_pad, ilens, ys_pad, olens
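A minimal sketch of the converter contract (toy arrays): the whole minibatch arrives as a single-element list whose item is already paired as ((xs, ys), utts):

import numpy as np

conv = CustomConverter(subsampling_factor=1, dtype=np.float32)
xs = [np.random.randn(120, 83), np.random.randn(90, 83)]   # (T, D) features
ys = [np.array([3, 7, 2]), np.array([5, 1])]               # token id targets
utts, xs_pad, ilens, ys_pad, olens = conv([((xs, ys), ["utt1", "utt2"])])
# xs_pad: (2, 120, 83) float32; ys_pad is right-padded with ignore_id (-1)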

@@ -0,0 +1,138 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from paddle.io import DataLoader
from deepspeech.frontend.utility import read_manifest
from deepspeech.io.batchfy import make_batchset
from deepspeech.io.converter import CustomConverter
from deepspeech.io.dataset import TransformDataset
from deepspeech.io.reader import LoadInputsAndTargets
from deepspeech.utils.log import Log
__all__ = ["BatchDataLoader"]
logger = Log(__name__).getlog()
class BatchDataLoader():
def __init__(self,
json_file: str,
train_mode: bool,
sortagrad: bool=False,
batch_size: int=0,
maxlen_in: float=float('inf'),
maxlen_out: float=float('inf'),
minibatches: int=0,
mini_batch_size: int=1,
batch_count: str='auto',
batch_bins: int=0,
batch_frames_in: int=0,
batch_frames_out: int=0,
batch_frames_inout: int=0,
preprocess_conf=None,
n_iter_processes: int=1,
subsampling_factor: int=1,
num_encs: int=1):
self.json_file = json_file
self.train_mode = train_mode
self.use_sortagrad = sortagrad == -1 or sortagrad > 0
self.batch_size = batch_size
self.maxlen_in = maxlen_in
self.maxlen_out = maxlen_out
self.batch_count = batch_count
self.batch_bins = batch_bins
self.batch_frames_in = batch_frames_in
self.batch_frames_out = batch_frames_out
self.batch_frames_inout = batch_frames_inout
self.subsampling_factor = subsampling_factor
self.num_encs = num_encs
self.preprocess_conf = preprocess_conf
self.n_iter_processes = n_iter_processes
# read json data
self.data_json = read_manifest(json_file)
# make minibatch list (variable length)
        self.minibatches = make_batchset(
self.data_json,
batch_size,
maxlen_in,
maxlen_out,
minibatches, # for debug
min_batch_size=mini_batch_size,
shortest_first=self.use_sortagrad,
count=batch_count,
batch_bins=batch_bins,
batch_frames_in=batch_frames_in,
batch_frames_out=batch_frames_out,
batch_frames_inout=batch_frames_inout,
iaxis=0,
oaxis=0, )
# data reader
self.reader = LoadInputsAndTargets(
mode="asr",
load_output=True,
preprocess_conf=preprocess_conf,
preprocess_args={"train":
train_mode}, # Switch the mode of preprocessing
)
# Setup a converter
if num_encs == 1:
self.converter = CustomConverter(
subsampling_factor=subsampling_factor, dtype=np.float32)
else:
            raise NotImplementedError("not impl CustomConverterMulEnc.")
# hack to make batchsize argument as 1
        # actual batchsize is included in a list
        # default collate function converts numpy array to paddle tensor
# we used an empty collate function instead which returns list
        self.dataset = TransformDataset(
            self.minibatches,
            lambda data: self.converter([self.reader(data, return_uttid=True)]))
self.dataloader = DataLoader(
dataset=self.dataset,
batch_size=1,
            shuffle=not self.use_sortagrad if train_mode else False,
collate_fn=lambda x: x[0],
num_workers=n_iter_processes, )
def __repr__(self):
echo = f"<{self.__class__.__module__}.{self.__class__.__name__} object at {hex(id(self))}> "
echo += f"train_mode: {self.train_mode}, "
echo += f"sortagrad: {self.use_sortagrad}, "
echo += f"batch_size: {self.batch_size}, "
echo += f"maxlen_in: {self.maxlen_in}, "
echo += f"maxlen_out: {self.maxlen_out}, "
echo += f"batch_count: {self.batch_count}, "
echo += f"batch_bins: {self.batch_bins}, "
echo += f"batch_frames_in: {self.batch_frames_in}, "
echo += f"batch_frames_out: {self.batch_frames_out}, "
echo += f"batch_frames_inout: {self.batch_frames_inout}, "
echo += f"subsampling_factor: {self.subsampling_factor}, "
echo += f"num_encs: {self.num_encs}, "
echo += f"num_workers: {self.n_iter_processes}, "
echo += f"file: {self.json_file}"
return echo
def __len__(self):
return len(self.dataloader)
def __iter__(self):
return self.dataloader.__iter__()
def __call__(self):
return self.__iter__()
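A usage sketch (paths and sizes hypothetical): build a loader over a data.json manifest and iterate the padded 5-tuples produced by CustomConverter:

loader = BatchDataLoader(
    json_file="data/manifest.train.json",   # hypothetical path
    train_mode=True,
    sortagrad=True,
    batch_size=32,
    maxlen_in=800,
    maxlen_out=150,
    batch_count="seq")
for utts, xs_pad, ilens, ys_pad, olens in loader:
    pass   # feed xs_pad/ilens/ys_pad/olens to the model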

@@ -19,7 +19,7 @@ from yacs.config import CfgNode
 from deepspeech.frontend.utility import read_manifest
 from deepspeech.utils.log import Log

-__all__ = ["ManifestDataset", "TripletManifestDataset"]
+__all__ = ["ManifestDataset", "TripletManifestDataset", "TransformDataset"]

 logger = Log(__name__).getlog()
@@ -76,12 +76,18 @@ class ManifestDataset(Dataset):
         Args:
             manifest_path (str): manifest json file path
-            max_input_len ([type], optional): maximum input seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to float('inf').
-            min_input_len (float, optional): minimum input seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to 0.0.
-            max_output_len (float, optional): maximum output seq length, in modeling units. Defaults to 500.0.
-            min_output_len (float, optional): minimum output seq length, in modeling units. Defaults to 0.0.
-            max_output_input_ratio (float, optional): maximum output seq length / input seq length ratio. Defaults to 10.0.
-            min_output_input_ratio (float, optional): minimum output seq length / input seq length ratio. Defaults to 0.05.
+            max_input_len ([type], optional): maximum input seq length,
+                in seconds for raw wav, in frame numbers for feature data. Defaults to float('inf').
+            min_input_len (float, optional): minimum input seq length,
+                in seconds for raw wav, in frame numbers for feature data. Defaults to 0.0.
+            max_output_len (float, optional): maximum output seq length,
+                in modeling units. Defaults to 500.0.
+            min_output_len (float, optional): minimum output seq length,
+                in modeling units. Defaults to 0.0.
+            max_output_input_ratio (float, optional): maximum output seq length / input seq length ratio.
+                Defaults to 10.0.
+            min_output_input_ratio (float, optional): minimum output seq length / input seq length ratio.
+                Defaults to 0.05.
         """
         super().__init__()
@@ -116,3 +122,27 @@ class TripletManifestDataset(ManifestDataset):
         instance = self._manifest[idx]
         return instance["utt"], instance["feat"], instance["text"], instance[
             "text1"]
class TransformDataset(Dataset):
"""Transform Dataset.
Args:
data: list object from make_batchset
        transform: transform function
"""
def __init__(self, data, transform):
"""Init function."""
super().__init__()
self.data = data
self.transform = transform
def __len__(self):
"""Len function."""
return len(self.data)
def __getitem__(self, idx):
"""[] operator."""
return self.transform(self.data[idx])
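BatchDataLoader above pairs this with a collate function that simply unwraps the single-element batch; each stored minibatch is converted lazily at indexing time. A small sketch (names hypothetical):

# minibatches: output of make_batchset; convert: any callable over one batch
dataset = TransformDataset(minibatches, convert)
first = dataset[0]   # runs convert(minibatches[0]) on access, not at build time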

@@ -0,0 +1,409 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import os
from collections import OrderedDict

import h5py
import kaldiio
import numpy as np
import soundfile
from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
from deepspeech.utils.log import Log
__all__ = ["LoadInputsAndTargets"]
logger = Log(__name__).getlog()
class LoadInputsAndTargets():
"""Create a mini-batch from a list of dicts
>>> batch = [('utt1',
... dict(input=[dict(feat='some.ark:123',
... filetype='mat',
... name='input1',
... shape=[100, 80])],
... output=[dict(tokenid='1 2 3 4',
... name='target1',
    ...                             shape=[4, 31])]))]
>>> l = LoadInputsAndTargets()
>>> feat, target = l(batch)
:param: str mode: Specify the task mode, "asr" or "tts"
:param: str preprocess_conf: The path of a json file for pre-processing
:param: bool load_input: If False, not to load the input data
:param: bool load_output: If False, not to load the output data
:param: bool sort_in_input_length: Sort the mini-batch in descending order
of the input length
:param: bool use_speaker_embedding: Used for tts mode only
:param: bool use_second_target: Used for tts mode only
:param: dict preprocess_args: Set some optional arguments for preprocessing
"""
def __init__(
self,
mode="asr",
preprocess_conf=None,
load_input=True,
load_output=True,
sort_in_input_length=True,
preprocess_args=None,
keep_all_data_on_mem=False, ):
self._loaders = {}
if mode not in ["asr"]:
raise ValueError("Only asr are allowed: mode={}".format(mode))
if preprocess_conf is not None:
self.preprocessing = AugmentationPipeline(preprocess_conf)
            logger.warning(
"[Experimental feature] Some preprocessing will be done "
"for the mini-batch creation using {}".format(
self.preprocessing))
else:
            # If conf doesn't exist, this function doesn't touch anything.
self.preprocessing = None
self.mode = mode
self.load_output = load_output
self.load_input = load_input
self.sort_in_input_length = sort_in_input_length
if preprocess_args is None:
self.preprocess_args = {}
else:
assert isinstance(preprocess_args, dict), type(preprocess_args)
self.preprocess_args = dict(preprocess_args)
self.keep_all_data_on_mem = keep_all_data_on_mem
def __call__(self, batch, return_uttid=False):
"""Function to load inputs and targets from list of dicts
:param List[Tuple[str, dict]] batch: list of dict which is subset of
loaded data.json
:param bool return_uttid: return utterance ID information for visualization
:return: list of input token id sequences [(L_1), (L_2), ..., (L_B)]
:return: list of input feature sequences
[(T_1, D), (T_2, D), ..., (T_B, D)]
:rtype: list of float ndarray
:return: list of target token id sequences [(L_1), (L_2), ..., (L_B)]
:rtype: list of int ndarray
"""
x_feats_dict = OrderedDict() # OrderedDict[str, List[np.ndarray]]
y_feats_dict = OrderedDict() # OrderedDict[str, List[np.ndarray]]
uttid_list = [] # List[str]
for uttid, info in batch:
uttid_list.append(uttid)
if self.load_input:
# Note(kamo): This for-loop is for multiple inputs
for idx, inp in enumerate(info["input"]):
# {"input":
# [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
# "filetype": "hdf5",
# "name": "input1", ...}], ...}
x = self._get_from_loader(
filepath=inp["feat"],
filetype=inp.get("filetype", "mat"))
x_feats_dict.setdefault(inp["name"], []).append(x)
if self.load_output:
for idx, inp in enumerate(info["output"]):
if "tokenid" in inp:
# ======= Legacy format for output =======
# {"output": [{"tokenid": "1 2 3 4"}])
x = np.fromiter(
map(int, inp["tokenid"].split()), dtype=np.int64)
else:
# ======= New format =======
# {"input":
# [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
# "filetype": "hdf5",
# "name": "target1", ...}], ...}
x = self._get_from_loader(
filepath=inp["feat"],
filetype=inp.get("filetype", "mat"))
y_feats_dict.setdefault(inp["name"], []).append(x)
if self.mode == "asr":
return_batch, uttid_list = self._create_batch_asr(
x_feats_dict, y_feats_dict, uttid_list)
else:
raise NotImplementedError(self.mode)
if self.preprocessing is not None:
# Apply pre-processing all input features
for x_name in return_batch.keys():
if x_name.startswith("input"):
return_batch[x_name] = self.preprocessing(
return_batch[x_name], uttid_list,
**self.preprocess_args)
if return_uttid:
return tuple(return_batch.values()), uttid_list
# Doesn't return the names now.
return tuple(return_batch.values())
def _create_batch_asr(self, x_feats_dict, y_feats_dict, uttid_list):
"""Create a OrderedDict for the mini-batch
:param OrderedDict x_feats_dict:
e.g. {"input1": [ndarray, ndarray, ...],
"input2": [ndarray, ndarray, ...]}
:param OrderedDict y_feats_dict:
e.g. {"target1": [ndarray, ndarray, ...],
"target2": [ndarray, ndarray, ...]}
:param: List[str] uttid_list:
Give uttid_list to sort in the same order as the mini-batch
:return: batch, uttid_list
:rtype: Tuple[OrderedDict, List[str]]
"""
        # handle single-input and multi-input (parallel) asr mode
xs = list(x_feats_dict.values())
if self.load_output:
ys = list(y_feats_dict.values())
assert len(xs[0]) == len(ys[0]), (len(xs[0]), len(ys[0]))
# get index of non-zero length samples
nonzero_idx = list(
filter(lambda i: len(ys[0][i]) > 0, range(len(ys[0]))))
for n in range(1, len(y_feats_dict)):
nonzero_idx = filter(lambda i: len(ys[n][i]) > 0, nonzero_idx)
else:
# Note(kamo): Be careful not to make nonzero_idx to a generator
nonzero_idx = list(range(len(xs[0])))
if self.sort_in_input_length:
# sort in input lengths based on the first input
nonzero_sorted_idx = sorted(
nonzero_idx, key=lambda i: -len(xs[0][i]))
else:
nonzero_sorted_idx = nonzero_idx
if len(nonzero_sorted_idx) != len(xs[0]):
            logger.warning(
"Target sequences include empty tokenid (batch {} -> {}).".
format(len(xs[0]), len(nonzero_sorted_idx)))
# remove zero-length samples
xs = [[x[i] for i in nonzero_sorted_idx] for x in xs]
uttid_list = [uttid_list[i] for i in nonzero_sorted_idx]
x_names = list(x_feats_dict.keys())
if self.load_output:
ys = [[y[i] for i in nonzero_sorted_idx] for y in ys]
y_names = list(y_feats_dict.keys())
# Keeping x_name and y_name, e.g. input1, for future extension
return_batch = OrderedDict([
* [(x_name, x) for x_name, x in zip(x_names, xs)],
* [(y_name, y) for y_name, y in zip(y_names, ys)],
])
else:
return_batch = OrderedDict(
[(x_name, x) for x_name, x in zip(x_names, xs)])
return return_batch, uttid_list
def _get_from_loader(self, filepath, filetype):
"""Return ndarray
        In order to open the file descriptors only on the first reference,
        the loaders are cached in self._loaders.
>>> ndarray = loader.get_from_loader(
... 'some/path.h5:F01_050C0101_PED_REAL', filetype='hdf5')
:param: str filepath:
:param: str filetype:
:return:
:rtype: np.ndarray
"""
if filetype == "hdf5":
# e.g.
# {"input": [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
# "filetype": "hdf5",
# -> filepath = "some/path.h5", key = "F01_050C0101_PED_REAL"
filepath, key = filepath.split(":", 1)
loader = self._loaders.get(filepath)
if loader is None:
# To avoid disk access, create loader only for the first time
loader = h5py.File(filepath, "r")
self._loaders[filepath] = loader
return loader[key][()]
elif filetype == "sound.hdf5":
# e.g.
# {"input": [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
# "filetype": "sound.hdf5",
# -> filepath = "some/path.h5", key = "F01_050C0101_PED_REAL"
filepath, key = filepath.split(":", 1)
loader = self._loaders.get(filepath)
if loader is None:
# To avoid disk access, create loader only for the first time
loader = SoundHDF5File(filepath, "r", dtype="int16")
self._loaders[filepath] = loader
array, rate = loader[key]
return array
elif filetype == "sound":
# e.g.
# {"input": [{"feat": "some/path.wav",
# "filetype": "sound"},
# Assume PCM16
if not self.keep_all_data_on_mem:
array, _ = soundfile.read(filepath, dtype="int16")
return array
if filepath not in self._loaders:
array, _ = soundfile.read(filepath, dtype="int16")
self._loaders[filepath] = array
return self._loaders[filepath]
elif filetype == "npz":
# e.g.
# {"input": [{"feat": "some/path.npz:F01_050C0101_PED_REAL",
# "filetype": "npz",
filepath, key = filepath.split(":", 1)
loader = self._loaders.get(filepath)
if loader is None:
# To avoid disk access, create loader only for the first time
loader = np.load(filepath)
self._loaders[filepath] = loader
return loader[key]
elif filetype == "npy":
# e.g.
# {"input": [{"feat": "some/path.npy",
# "filetype": "npy"},
if not self.keep_all_data_on_mem:
return np.load(filepath)
if filepath not in self._loaders:
self._loaders[filepath] = np.load(filepath)
return self._loaders[filepath]
elif filetype in ["mat", "vec"]:
# e.g.
# {"input": [{"feat": "some/path.ark:123",
# "filetype": "mat"}]},
# In this case, "123" indicates the starting points of the matrix
# load_mat can load both matrix and vector
if not self.keep_all_data_on_mem:
return kaldiio.load_mat(filepath)
if filepath not in self._loaders:
self._loaders[filepath] = kaldiio.load_mat(filepath)
return self._loaders[filepath]
elif filetype == "scp":
# e.g.
# {"input": [{"feat": "some/path.scp:F01_050C0101_PED_REAL",
# "filetype": "scp",
filepath, key = filepath.split(":", 1)
loader = self._loaders.get(filepath)
if loader is None:
# To avoid disk access, create loader only for the first time
loader = kaldiio.load_scp(filepath)
self._loaders[filepath] = loader
return loader[key]
else:
raise NotImplementedError(
"Not supported: loader_type={}".format(filetype))
class SoundHDF5File():
    """Collect sound files into a single HDF5 file.

    >>> f = SoundHDF5File('a.flac.h5', mode='a')
    >>> array = np.random.randint(0, 100, 100, dtype=np.int16)
    >>> f['id'] = (array, 16000)
    >>> array, rate = f['id']

    :param str filepath: HDF5 file path
    :param str mode: file open mode, e.g. 'r', 'r+', 'a'
    :param str format: audio format used when saving: flac, nist, htk, etc.
    :param str dtype: sample dtype used when reading, e.g. 'int16'
    """

    def __init__(self,
                 filepath,
                 mode="r+",
                 format=None,
                 dtype="int16",
                 **kwargs):
        self.filepath = filepath
        self.mode = mode
        self.dtype = dtype

        self.file = h5py.File(filepath, mode, **kwargs)
        if format is None:
            # filepath = a.flac.h5 -> format = flac
            second_ext = os.path.splitext(os.path.splitext(filepath)[0])[1]
            format = second_ext[1:]
        if format.upper() not in soundfile.available_formats():
            # Fall back to flac if the inferred format is unknown
            format = "flac"
        # This format affects only saving
        self.format = format

    def __repr__(self):
        return '<SoundHDF5 file "{}" (mode {}, format {}, type {})>'.format(
            self.filepath, self.mode, self.format, self.dtype)

    def create_dataset(self, name, shape=None, data=None, **kwds):
        f = io.BytesIO()
        array, rate = data
        soundfile.write(f, array, rate, format=self.format)
        self.file.create_dataset(
            name, shape=shape, data=np.void(f.getvalue()), **kwds)

    def __setitem__(self, name, data):
        self.create_dataset(name, data=data)

    def __getitem__(self, key):
        data = self.file[key][()]
        f = io.BytesIO(data.tobytes())
        array, rate = soundfile.read(f, dtype=self.dtype)
        return array, rate

    def keys(self):
        return self.file.keys()

    def values(self):
        for k in self.file:
            yield self[k]

    def items(self):
        for k in self.file:
            yield k, self[k]

    def __iter__(self):
        return iter(self.file)

    def __contains__(self, item):
        return item in self.file

    def __len__(self):
        return len(self.file)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.file.close()

    def close(self):
        self.file.close()
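A short round-trip sketch for SoundHDF5File; the file name and data are illustrative, and since FLAC is lossless the int16 samples survive the round trip:

import numpy as np

with SoundHDF5File('utts.flac.h5', mode='a') as f:  # format inferred: flac
    wav = np.random.randint(-100, 100, 16000, dtype=np.int16)
    f['utt1'] = (wav, 16000)  # stores (samples, sample_rate) as encoded bytes
    array, rate = f['utt1']   # decodes back to int16 samples
    assert rate == 16000 and np.array_equal(array, wav)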

@@ -17,11 +17,16 @@ import numpy as np
 from deepspeech.utils.log import Log

-__all__ = ["pad_sequence"]
+__all__ = ["pad_list", "pad_sequence"]

 logger = Log(__name__).getlog()


+def pad_list(sequences: List[np.ndarray],
+             padding_value: float=0.0) -> np.ndarray:
+    return pad_sequence(sequences, True, padding_value)
+
+
 def pad_sequence(sequences: List[np.ndarray],
                  batch_first: bool=True,
                  padding_value: float=0.0) -> np.ndarray:
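pad_list is simply a batch-first alias of pad_sequence. A numpy sketch of the intended semantics; pad_list_sketch is a stand-in, not the repo's implementation:

import numpy as np
from typing import List

def pad_list_sketch(seqs: List[np.ndarray],
                    padding_value: float=0.0) -> np.ndarray:
    # Pad variable-length arrays up to the longest one, batch-first.
    max_len = max(s.shape[0] for s in seqs)
    out = np.full((len(seqs), max_len) + seqs[0].shape[1:],
                  padding_value, dtype=seqs[0].dtype)
    for i, s in enumerate(seqs):
        out[i, :s.shape[0]] = s
    return out

batch = pad_list_sketch([np.ones(3), np.ones(2)])  # shape (2, 3), zero-padded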

@@ -297,7 +297,7 @@ class RNNStack(nn.Layer):
                     share_weights=share_rnn_weights))
             i_size = h_size * 2

-        self.rnn_stacks = nn.ModuleList(rnn_stacks)
+        self.rnn_stacks = nn.LayerList(rnn_stacks)

     def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
         """

@@ -54,7 +54,7 @@ __all__ = ["U2Model", "U2InferModel"]
 logger = Log(__name__).getlog()


-class U2BaseModel(nn.Module):
+class U2BaseModel(nn.Layer):
     """CTC-Attention hybrid Encoder-Decoder model"""

     @classmethod

@@ -48,7 +48,7 @@ __all__ = ["U2STModel", "U2STInferModel"]
 logger = Log(__name__).getlog()


-class U2STBaseModel(nn.Module):
+class U2STBaseModel(nn.Layer):
     """CTC-Attention hybrid Encoder-Decoder model"""

     @classmethod

@@ -33,7 +33,7 @@ logger = Log(__name__).getlog()
 __all__ = ["TransformerDecoder"]


-class TransformerDecoder(nn.Module):
+class TransformerDecoder(nn.Layer):
     """Base class of Transfomer decoder module.

     Args:
         vocab_size: output dim
@@ -86,7 +86,7 @@ class TransformerDecoder(nn.Module):
         self.use_output_layer = use_output_layer
         self.output_layer = nn.Linear(attention_dim, vocab_size)

-        self.decoders = nn.ModuleList([
+        self.decoders = nn.LayerList([
             DecoderLayer(
                 size=attention_dim,
                 self_attn=MultiHeadedAttention(attention_heads, attention_dim,

@@ -25,15 +25,15 @@ logger = Log(__name__).getlog()
 __all__ = ["DecoderLayer"]


-class DecoderLayer(nn.Module):
+class DecoderLayer(nn.Layer):
     """Single decoder layer module.

     Args:
         size (int): Input dimension.
-        self_attn (nn.Module): Self-attention module instance.
+        self_attn (nn.Layer): Self-attention module instance.
             `MultiHeadedAttention` instance can be used as the argument.
-        src_attn (nn.Module): Self-attention module instance.
+        src_attn (nn.Layer): Self-attention module instance.
             `MultiHeadedAttention` instance can be used as the argument.
-        feed_forward (nn.Module): Feed-forward module instance.
+        feed_forward (nn.Layer): Feed-forward module instance.
             `PositionwiseFeedForward` instance can be used as the argument.
         dropout_rate (float): Dropout rate.
         normalize_before (bool):
@@ -48,9 +48,9 @@ class DecoderLayer(nn.Module):
     def __init__(
             self,
             size: int,
-            self_attn: nn.Module,
-            src_attn: nn.Module,
-            feed_forward: nn.Module,
+            self_attn: nn.Layer,
+            src_attn: nn.Layer,
+            feed_forward: nn.Layer,
             dropout_rate: float,
             normalize_before: bool=True,
             concat_after: bool=False, ):

@@ -358,7 +358,7 @@ class TransformerEncoder(BaseEncoder):
             pos_enc_layer_type, normalize_before, concat_after,
             static_chunk_size, use_dynamic_chunk, global_cmvn,
             use_dynamic_left_chunk)
-        self.encoders = nn.ModuleList([
+        self.encoders = nn.LayerList([
             TransformerEncoderLayer(
                 size=output_size,
                 self_attn=MultiHeadedAttention(attention_heads, output_size,
@@ -438,7 +438,7 @@ class ConformerEncoder(BaseEncoder):
         convolution_layer_args = (output_size, cnn_module_kernel, activation,
                                   cnn_module_norm, causal)

-        self.encoders = nn.ModuleList([
+        self.encoders = nn.LayerList([
             ConformerEncoderLayer(
                 size=output_size,
                 self_attn=encoder_selfattn_layer(*encoder_selfattn_layer_args),
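These renames drop the torch-style ModuleList alias in favor of Paddle's native container. A minimal LayerList sketch; TinyStack is illustrative, not from the repo:

import paddle
from paddle import nn

# LayerList registers sublayers so their parameters are tracked,
# just like the encoder stacks above.
class TinyStack(nn.Layer):
    def __init__(self, num_layers=2, dim=8):
        super().__init__()
        self.layers = nn.LayerList(
            [nn.Linear(dim, dim) for _ in range(num_layers)])

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

net = TinyStack()
y = net(paddle.randn([4, 8]))  # both Linear layers' parameters are registered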

@@ -48,7 +48,8 @@ class CTCLoss(nn.Layer):
         logits = logits.transpose([1, 0, 2])
         # (TODO:Hui Zhang) ctc loss does not support int64 labels
         ys_pad = ys_pad.astype(paddle.int32)
-        loss = self.loss(logits, ys_pad, hlens, ys_lens)
+        loss = self.loss(
+            logits, ys_pad, hlens, ys_lens, norm_by_times=self.batch_average)
         if self.batch_average:
             # Batch-size average
             loss = loss / B
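With this change, warpctc's per-sequence time normalization is enabled exactly when batch averaging is requested. A hedged usage sketch; the tensor shapes and the CTCLoss constructor arguments are assumptions, not the repo's exact API:

import paddle

# Illustrative shapes: B=2 utterances, T=50 encoder frames, V=100 tokens.
criterion = CTCLoss(blank=0, batch_average=True)  # constructor args assumed
logits = paddle.randn([2, 50, 100])           # (B, T, V), pre-softmax
ys_pad = paddle.randint(1, 100, [2, 20])      # padded label ids
hlens = paddle.to_tensor([50, 42])            # encoder output lengths
ys_lens = paddle.to_tensor([20, 15])          # label lengths per utterance
loss = criterion(logits, ys_pad, hlens, ys_lens)  # scalar, divided by B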

@@ -297,7 +297,7 @@ class RNNStack(nn.Layer):
                     share_weights=share_rnn_weights))
             i_size = h_size * 2

-        self.rnn_stacks = nn.ModuleList(rnn_stacks)
+        self.rnn_stacks = nn.LayerList(rnn_stacks)

     def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
         """

@@ -21,7 +21,6 @@
 | --- | --- | --- | --- | --- | --- | --- | --- |
 | conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean-all | attention | 6.35 | 0.057117 |
-
 ## Chunk Conformer

 | Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left Chunks | Loss | WER |
 | --- | --- | --- | --- | --- | --- | --- | --- | --- |
@@ -39,4 +38,7 @@
 ### Test w/o length filter
 | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
 | --- | --- | --- | --- | --- | --- | --- | --- |
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | attention | 6.98 | 0.066500 |
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | attention | 7.63 | 0.056832 |
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | ctc_greedy_search | 7.63 | 0.059742 |
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | ctc_prefix_beam_search | 7.63 | 0.059057 |
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | attention_rescoring | 7.63 | 0.047417 |

@@ -4,7 +4,7 @@ data:
   dev_manifest: data/manifest.dev
   test_manifest: data/manifest.test-clean
   min_input_len: 0.5 # second
-  max_input_len: 20.0 # second
+  max_input_len: 30.0 # second
   min_output_len: 0.0 # tokens
   max_output_len: 400.0 # tokens
   min_output_input_ratio: 0.05

@@ -5,7 +5,7 @@ source path.sh
 stage=0
 stop_stage=100
 conf_path=conf/transformer.yaml
-avg_num=30
+avg_num=5
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;

 avg_ckpt=avg_${avg_num}

@@ -0,0 +1,36 @@
+#!/usr/bin/env python
+import argparse
+import json
+
+
+def main(args):
+    with open(args.json_file, 'r') as fin:
+        data_json = json.load(fin)
+
+    # manifest format:
+    # {"input": [
+    #        {"feat": "dev/deltafalse/feats.1.ark:842920", "name": "input1", "shape": [349, 83]}
+    #    ],
+    #  "output": [
+    #        {"name": "target1", "shape": [12, 5002], "text": "NO APOLLO", "token": "▁NO ▁A PO LL O", "tokenid": "3144 482 352 269 317"}
+    #    ],
+    #  "utt2spk": "116-288045",
+    #  "utt": "116-288045-0019"}
+    with open(args.manifest_file, 'w') as fout:
+        for key, value in data_json['utts'].items():
+            value['utt'] = key
+            fout.write(json.dumps(value, ensure_ascii=False))
+            fout.write("\n")
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        '--json-file', type=str, default=None, help="espnet data json file.")
+    parser.add_argument(
+        '--manifest-file',
+        type=str,
+        default='manifest.train',
+        help='manifest data json line file.')
+    args = parser.parse_args()
+    main(args)
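The converter can also be driven from Python rather than the CLI; a hedged example where the script name and paths are illustrative:

import argparse

# Equivalent to:
#   python espnet_json_to_manifest.py --json-file data/train/data.json \
#       --manifest-file data/manifest.train
args = argparse.Namespace(
    json_file='data/train/data.json',
    manifest_file='data/manifest.train')
main(args)  # writes one JSON object per line, with the utt key injected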

@@ -5,7 +5,7 @@ source path.sh
 stage=0
 stop_stage=100
 conf_path=conf/transformer.yaml
-avg_num=30
+avg_num=5
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;

 avg_ckpt=avg_${avg_num}

@@ -1,5 +1,6 @@
 coverage
 gpustat
+kaldiio
 pre-commit
 pybind11
 resampy==0.2.2
