PaddleSpeech/deepspeech/io/utility.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List

import numpy as np

from deepspeech.utils.log import Log

# Public API of this module: ``from ... import *`` exposes only ``pad_sequence``.
__all__ = ["pad_sequence"]

# Module-level logger obtained from the project's ``Log`` helper, keyed by this
# module's import name.
logger = Log(__name__).getlog()


def pad_sequence(sequences: List[np.ndarray],
                 batch_first: bool=True,
                 padding_value: float=0.0) -> np.ndarray:
    r"""Pad a list of variable length arrays with ``padding_value``.

    ``pad_sequence`` stacks a list of arrays along a new (batch) dimension
    and pads them along their first axis to equal length. If the input is a
    list of sequences each of size ``L x *``, the output has size
    ``B x T x *`` when ``batch_first`` is True and ``T x B x *`` otherwise.

    `B` is batch size. It is equal to the number of elements in ``sequences``.
    `T` is length of the longest sequence.
    `L` is length of an individual sequence.
    `*` is any number of trailing dimensions, including none.

    Example:
        >>> a = np.ones([25, 300])
        >>> b = np.ones([22, 300])
        >>> c = np.ones([15, 300])
        >>> pad_sequence([a, b, c]).shape
        (3, 25, 300)
        >>> pad_sequence([a, b, c], batch_first=False).shape
        (25, 3, 300)

    Note:
        Trailing dimensions and dtype are taken from ``sequences[0]`` and are
        assumed to be the same for every element; this is not checked.
        ``padding_value`` is stored using the dtype of ``sequences[0]``.

    Args:
        sequences (list[np.ndarray]): non-empty list of variable length
            sequences.
        batch_first (bool, optional): output will be in ``B x T x *`` if True,
            or in ``T x B x *`` otherwise. Default: True.
        padding_value (float, optional): value for padded elements. Default: 0.

    Returns:
        np.ndarray of size ``B x T x *`` if :attr:`batch_first` is ``True``.
        np.ndarray of size ``T x B x *`` otherwise.

    Raises:
        IndexError: if ``sequences`` is empty.
    """
    # Trailing dimensions and dtype are assumed to be shared by the whole
    # batch, so they are read from the first sequence only.
    first = sequences[0]
    trailing_dims = first.shape[1:]
    batch_size = len(sequences)
    max_len = max(s.shape[0] for s in sequences)

    if batch_first:
        out_dims = (batch_size, max_len) + trailing_dims
    else:
        out_dims = (max_len, batch_size) + trailing_dims

    # Start from a buffer that is entirely padding, then copy each sequence
    # into the leading ``length`` positions of its own batch slot.
    out_tensor = np.full(out_dims, padding_value, dtype=first.dtype)
    for i, tensor in enumerate(sequences):
        length = tensor.shape[0]
        if batch_first:
            out_tensor[i, :length, ...] = tensor
        else:
            out_tensor[:length, i, ...] = tensor

    return out_tensor
E2E/Streaming Transformer/Conformer ASR (#578) * add cmvn and label smoothing loss layer * add layer for transformer * add glu and conformer conv * add torch compatiable hack, mask funcs * not hack size since it exists * add test; attention * add attention, common utils, hack paddle * add audio utils * conformer batch padding mask bug fix #223 * fix typo, python infer fix rnn mem opt name error and batchnorm1d, will be available at 2.0.2 * fix ci * fix ci * add encoder * refactor egs * add decoder * refactor ctc, add ctc align, refactor ckpt, add warmup lr scheduler, cmvn utils * refactor docs * add fix * fix readme * fix bugs, refactor collator, add pad_sequence, fix ckpt bugs * fix docstring * refactor data feed order * add u2 model * refactor cmvn, test * add utils * add u2 config * fix bugs * fix bugs * fix autograd maybe has problem when using inplace operation * refactor data, build vocab; add format data * fix text featurizer * refactor build vocab * add fbank, refactor feature of speech * refactor audio feat * refactor data preprare * refactor data * model init from config * add u2 bins * flake8 * can train * fix bugs, add coverage, add scripts * test can run * fix data * speed perturb with sox * add spec aug * fix for train * fix train logitc * fix logger * log valid loss, time dataset process * using np for speed perturb, remove some debug log of grad clip * fix logger * fix build vocab * fix logger name * using module logger as default * fix * fix install * reorder imports * fix board logger * fix logger * kaldi fbank and mfcc * fix cmvn and print prarams * fix add_eos_sos and cmvn * fix cmvn compute * fix logger and cmvn * fix subsampling, label smoothing loss, remove useless * add notebook test * fix log * fix tb logger * multi gpu valid * fix log * fix log * fix config * fix compute cmvn, need paddle 2.1 * add cmvn notebook * fix layer tools * fix compute cmvn * add rtf * fix decoding * fix layer tools * fix log, add avg script * more avg and test 
info * fix dataset pickle problem; using 2.1 paddle; num_workers can > 0; ckpt save in exp dir;fix setup.sh; * add vimrc * refactor tiny script, add transformer and stream conf * spm demo; librisppech scripts and confs * fix log * add librispeech scripts * refactor data pipe; fix conf; fix u2 default params * fix bugs * refactor aishell scripts * fix test * fix cmvn * fix s0 scripts * fix ds2 scripts and bugs * fix dev & test dataset filter * fix dataset filter * filter dev * fix ckpt path * filter test, since librispeech will cause OOM, but all test wer will be worse, since mismatch train with test * add comment * add syllable doc * fix ds2 configs * add doc * add pypinyin tools * fix decoder using blank_id=0 * mmseg with pybind11 * format code 4 years ago			`# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`from typing import List`

			`import numpy as np`

			`from deepspeech.utils.log import Log`

			`__all__ = ["pad_sequence"]`

			`logger = Log(__name__).getlog()`


			`def pad_sequence(sequences: List[np.ndarray],`
			`batch_first: bool=True,`
			`padding_value: float=0.0) -> np.ndarray:`
			r"""Pad a list of variable length Tensors with ``padding_value``

			``pad_sequence`` stacks a list of Tensors along a new dimension,
			`and pads them to equal length. For example, if the input is list of`
			sequences with size ``L x `` and if batch_first is False, and ``T x B x ``
			`otherwise.`

			`B` is batch size. It is equal to the number of elements in ``sequences``.
			`T` is length of the longest sequence.
			`L` is length of the sequence.
			`*` is any number of trailing dimensions, including none.

			`Example:`
			`>>> a = np.ones([25, 300])`
			`>>> b = np.ones([22, 300])`
			`>>> c = np.ones([15, 300])`
			`>>> pad_sequence([a, b, c]).shape`
			`[25, 3, 300]`

			`Note:`
			This function returns a np.ndarray of size ``T x B x `` or ``B x T x ``
			where `T` is the length of the longest sequence. This function assumes
			`trailing dimensions and type of all the Tensors in sequences are same.`

			`Args:`
			`sequences (list[np.ndarray]): list of variable length sequences.`
			batch_first (bool, optional): output will be in ``B x T x *`` if True, or in
			``T x B x *`` otherwise
			`padding_value (float, optional): value for padded elements. Default: 0.`

			`Returns:`
			np.ndarray of size ``T x B x *`` if :attr:`batch_first` is ``False``.
			np.ndarray of size ``B x T x *`` otherwise
			`"""`

			`# assuming trailing dimensions and type of all the Tensors`
			`# in sequences are same and fetching those from sequences[0]`
			`max_size = sequences[0].shape`
			`trailing_dims = max_size[1:]`
			`max_len = max([s.shape[0] for s in sequences])`
			`if batch_first:`
			`out_dims = (len(sequences), max_len) + trailing_dims`
			`else:`
			`out_dims = (max_len, len(sequences)) + trailing_dims`

			`out_tensor = np.full(out_dims, padding_value, dtype=sequences[0].dtype)`
			`for i, tensor in enumerate(sequences):`
			`length = tensor.shape[0]`
			`# use index notation to prevent duplicate references to the tensor`
			`if batch_first:`
			`out_tensor[i, :length, ...] = tensor`
			`else:`
			`out_tensor[:length, i, ...] = tensor`

			`return out_tensor`