PaddleSpeech/paddlespeech/s2t/io/dataloader.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any
from typing import Dict
from typing import List
from typing import Text

import jsonlines
import numpy as np
from paddle.io import BatchSampler
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler

from paddlespeech.s2t.io.batchfy import make_batchset
from paddlespeech.s2t.io.converter import CustomConverter
from paddlespeech.s2t.io.dataset import TransformDataset
from paddlespeech.s2t.io.reader import LoadInputsAndTargets
from paddlespeech.s2t.utils.log import Log

__all__ = ["BatchDataLoader"]

logger = Log(__name__).getlog()


def feat_dim_and_vocab_size(data_json: List[Dict[Text, Any]],
                            mode: Text="asr",
                            iaxis=0,
                            oaxis=0):
    """Read the feature dimension and vocabulary size from a manifest.

    Only the first manifest entry (``data_json[0]``) is inspected; the second
    element of the selected ``shape`` lists is returned for each side.

    Args:
        data_json (List[Dict[Text, Any]]): manifest entries, each holding
            ``'input'`` and ``'output'`` lists of dicts with a ``'shape'`` key.
        mode (Text): task mode; only ``'asr'`` is supported.
        iaxis (int): index into the ``'input'`` list of the entry.
        oaxis (int): index into the ``'output'`` list of the entry.

    Returns:
        Tuple: ``(feat_dim, vocab_size)``.

    Raises:
        ValueError: if ``mode`` is not ``'asr'``.
    """
    if mode != 'asr':
        raise ValueError(f"{mode} mode not support!")

    first_utt = data_json[0]
    # The input side must be selected with ``iaxis``; indexing it with
    # ``oaxis`` only worked by accident while both axes were 0.
    feat_dim = first_utt['input'][iaxis]['shape'][1]
    vocab_size = first_utt['output'][oaxis]['shape'][1]
    return feat_dim, vocab_size


def batch_collate(x):
    """Unwrap the single pre-built minibatch handed over by the loader.

    The batch is already composed by the user, so the loader-level batch is a
    one-element list and collation only has to strip that outer list.

    NOTE(review): kept as a plain module-level function, presumably so it
    stays picklable for worker processes; confirm before replacing it with a
    lambda or a closure.

    Args:
        x (List[Tuple]): [(utts, xs, ilens, ys, olens)]

    Returns:
        Tuple: (utts, xs, ilens, ys, olens)
    """
    minibatch = x[0]
    return minibatch


class BatchDataLoader():
    """Iterable ASR data loader working on user-composed minibatches.

    Construction reads a jsonlines manifest, groups the entries into
    minibatches with ``make_batchset``, wraps them in a ``TransformDataset``
    (using ``LoadInputsAndTargets`` as reader and ``CustomConverter`` as
    converter) and finally builds a ``paddle.io.DataLoader`` whose sampler
    batch size is 1, so every item yielded is one pre-built minibatch.

    Args:
        json_file (str): path of the jsonlines manifest.
        train_mode (bool): forwarded to the preprocessing as ``train``; also
            shuffling is only ever enabled when this is true.
        sortagrad (int): sortagrad is considered enabled when the value is
            ``-1`` or greater than ``0``; when enabled, batches are built
            shortest-first and shuffling is turned off.
        batch_size (int): forwarded to ``make_batchset``.
        maxlen_in (float): forwarded to ``make_batchset``.
        maxlen_out (float): forwarded to ``make_batchset``.
        minibatches (int): forwarded to ``make_batchset`` (for debug).
        mini_batch_size (int): forwarded as ``min_batch_size``.
        batch_count (str): forwarded as ``count``.
        batch_bins (int): forwarded to ``make_batchset``.
        batch_frames_in (int): forwarded to ``make_batchset``.
        batch_frames_out (int): forwarded to ``make_batchset``.
        batch_frames_inout (int): forwarded to ``make_batchset``.
        preprocess_conf: forwarded to ``LoadInputsAndTargets``.
        n_iter_processes (int): used as ``num_workers`` of the DataLoader.
        subsampling_factor (int): forwarded to ``CustomConverter``.
        load_aux_input (bool): forwarded to ``CustomConverter``.
        load_aux_output (bool): forwarded to ``CustomConverter``.
        num_encs (int): number of encoders; only ``1`` is implemented.
        dist_sampler (bool): use ``DistributedBatchSampler`` instead of
            ``BatchSampler``.
        shortest_first (bool): build batches shortest-first even without
            sortagrad.

    Raises:
        NotImplementedError: if ``num_encs`` is not ``1``.
    """

    def __init__(self,
                 json_file: str,
                 train_mode: bool,
                 sortagrad: int=0,
                 batch_size: int=0,
                 maxlen_in: float=float('inf'),
                 maxlen_out: float=float('inf'),
                 minibatches: int=0,
                 mini_batch_size: int=1,
                 batch_count: str='auto',
                 batch_bins: int=0,
                 batch_frames_in: int=0,
                 batch_frames_out: int=0,
                 batch_frames_inout: int=0,
                 preprocess_conf=None,
                 n_iter_processes: int=1,
                 subsampling_factor: int=1,
                 load_aux_input: bool=False,
                 load_aux_output: bool=False,
                 num_encs: int=1,
                 dist_sampler: bool=False,
                 shortest_first: bool=False):
        # Fail fast, before any file I/O: only the single-encoder converter
        # exists. This used to be ``assert NotImplementedError(...)``, which
        # never fires (an exception instance is truthy) and left
        # ``self.converter`` undefined.
        if num_encs != 1:
            raise NotImplementedError("not impl CustomConverterMulEnc.")

        self.json_file = json_file
        self.train_mode = train_mode
        self.use_sortagrad = sortagrad == -1 or sortagrad > 0
        self.batch_size = batch_size
        self.maxlen_in = maxlen_in
        self.maxlen_out = maxlen_out
        self.batch_count = batch_count
        self.batch_bins = batch_bins
        self.batch_frames_in = batch_frames_in
        self.batch_frames_out = batch_frames_out
        self.batch_frames_inout = batch_frames_inout
        self.subsampling_factor = subsampling_factor
        self.num_encs = num_encs
        self.preprocess_conf = preprocess_conf
        self.n_iter_processes = n_iter_processes
        self.load_aux_input = load_aux_input
        self.load_aux_output = load_aux_output
        self.dist_sampler = dist_sampler
        self.shortest_first = shortest_first

        # read json data
        with jsonlines.open(json_file, 'r') as reader:
            self.data_json = list(reader)

        self.feat_dim, self.vocab_size = feat_dim_and_vocab_size(
            self.data_json, mode='asr')

        # make minibatch list (variable length)
        # NOTE: the attribute name keeps its historical spelling
        # ("minibaches") because code outside this class may read it.
        self.minibaches = make_batchset(
            self.data_json,
            batch_size,
            maxlen_in,
            maxlen_out,
            minibatches,  # for debug
            min_batch_size=mini_batch_size,
            shortest_first=self.shortest_first or self.use_sortagrad,
            count=batch_count,
            batch_bins=batch_bins,
            batch_frames_in=batch_frames_in,
            batch_frames_out=batch_frames_out,
            batch_frames_inout=batch_frames_inout,
            iaxis=0,
            oaxis=0, )

        # data reader
        self.reader = LoadInputsAndTargets(
            mode="asr",
            load_output=True,
            preprocess_conf=preprocess_conf,
            preprocess_args={"train":
                             train_mode},  # Switch the mode of preprocessing
        )

        # Setup a converter (num_encs == 1 is guaranteed by the check above)
        self.converter = CustomConverter(
            subsampling_factor=subsampling_factor,
            dtype=np.float32,
            load_aux_input=load_aux_input,
            load_aux_output=load_aux_output)

        # hack to make batchsize argument as 1
        # actual batchsize is included in a list
        # default collate function converts numpy array to paddle tensor
        # we used an empty collate function instead which returns list
        self.dataset = TransformDataset(self.minibaches, self.converter,
                                        self.reader)

        # Shuffle only while training, and never together with sortagrad.
        shuffle = not self.use_sortagrad if self.train_mode else False
        # Both samplers are built with the same keyword arguments; only the
        # class differs.
        if self.dist_sampler:
            sampler_cls = DistributedBatchSampler
        else:
            sampler_cls = BatchSampler
        self.batch_sampler = sampler_cls(
            dataset=self.dataset,
            batch_size=1,
            shuffle=shuffle,
            drop_last=False, )

        self.dataloader = DataLoader(
            dataset=self.dataset,
            batch_sampler=self.batch_sampler,
            collate_fn=batch_collate,
            num_workers=self.n_iter_processes, )

    def __len__(self):
        """Return the length of the underlying DataLoader."""
        return len(self.dataloader)

    def __iter__(self):
        """Return a fresh iterator over the underlying DataLoader."""
        return self.dataloader.__iter__()

    def __call__(self):
        """Calling the loader is equivalent to iterating it."""
        return self.__iter__()

    def __repr__(self):
        """Return the default object header followed by the configuration."""
        header = f"<{self.__class__.__module__}.{self.__class__.__name__} object at {hex(id(self))}> "
        fields = [
            f"train_mode: {self.train_mode}",
            f"sortagrad: {self.use_sortagrad}",
            f"batch_size: {self.batch_size}",
            f"maxlen_in: {self.maxlen_in}",
            f"maxlen_out: {self.maxlen_out}",
            f"batch_count: {self.batch_count}",
            f"batch_bins: {self.batch_bins}",
            f"batch_frames_in: {self.batch_frames_in}",
            f"batch_frames_out: {self.batch_frames_out}",
            f"batch_frames_inout: {self.batch_frames_inout}",
            f"subsampling_factor: {self.subsampling_factor}",
            f"num_encs: {self.num_encs}",
            f"num_workers: {self.n_iter_processes}",
            f"load_aux_input: {self.load_aux_input}",
            f"load_aux_output: {self.load_aux_output}",
            f"dist_sampler: {self.dist_sampler}",
            f"shortest_first: {self.shortest_first}",
            f"file: {self.json_file}",
        ]
        return header + ", ".join(fields)
more data utils 3 years ago			`# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
fix for kaldi 3 years ago			`from typing import Any`
			`from typing import Dict`
			`from typing import List`
			`from typing import Text`

format code 3 years ago			`import jsonlines`
fix for kaldi 3 years ago			`import numpy as np`
[s2t] DataLoader with BatchSampler or DistributeBatchSampler (#1242) * batchsampler or distributebatchsampler * format 3 years ago			`from paddle.io import BatchSampler`
more data utils 3 years ago			`from paddle.io import DataLoader`
[st] Distributed sampler and new dataloader with MIMO (#1239) * update timit result, test=doc_fix * result update * fix bug * add triplet loader * empty preprocess file * sync to u2, updating * sync to u2 config * fix bugs * code refine * update config * customize decoding batch size * update optimizer and lr scheduler * minor * minor * minor * fix bugs of refs * minor * distributed sampler * minor * refine the loader 3 years ago			`from paddle.io import DistributedBatchSampler`
more data utils 3 years ago
merge deepspeech, parakeet and text_processing into paddlespeech 3 years ago			`from paddlespeech.s2t.io.batchfy import make_batchset`
			`from paddlespeech.s2t.io.converter import CustomConverter`
			`from paddlespeech.s2t.io.dataset import TransformDataset`
			`from paddlespeech.s2t.io.reader import LoadInputsAndTargets`
			`from paddlespeech.s2t.utils.log import Log`
more data utils 3 years ago
refactor io 3 years ago			`__all__ = ["BatchDataLoader"]`
more data utils 3 years ago
			`logger = Log(__name__).getlog()`


fix for kaldi 3 years ago			`def feat_dim_and_vocab_size(data_json: List[Dict[Text, Any]],`
			`mode: Text="asr",`
			`iaxis=0,`
			`oaxis=0):`
			`if mode == 'asr':`
			`feat_dim = data_json[0]['input'][oaxis]['shape'][1]`
			`vocab_size = data_json[0]['output'][oaxis]['shape'][1]`
			`else:`
			`raise ValueError(f"{mode} mode not support!")`
			`return feat_dim, vocab_size`


fix dataloader pickle bugs 3 years ago			`def batch_collate(x):`
relase librispeech audio max len to 30 second 3 years ago			`"""de-minibatch, since user compose batch.`
fix dataloader pickle bugs 3 years ago
			`Args:`
			`x (List[Tuple]): [(utts, xs, ilens, ys, olens)]`

			`Returns:`
			`Tuple: (utts, xs, ilens, ys, olens)`
			`"""`
			`return x[0]`


more data utils 3 years ago			`class BatchDataLoader():`
			`def __init__(self,`
			`json_file: str,`
			`train_mode: bool,`
fix sortagrad, test=asr 3 years ago			`sortagrad: int=0,`
more data utils 3 years ago			`batch_size: int=0,`
			`maxlen_in: float=float('inf'),`
			`maxlen_out: float=float('inf'),`
			`minibatches: int=0,`
			`mini_batch_size: int=1,`
			`batch_count: str='auto',`
			`batch_bins: int=0,`
			`batch_frames_in: int=0,`
			`batch_frames_out: int=0,`
			`batch_frames_inout: int=0,`
			`preprocess_conf=None,`
			`n_iter_processes: int=1,`
			`subsampling_factor: int=1,`
[st] Distributed sampler and new dataloader with MIMO (#1239) * update timit result, test=doc_fix * result update * fix bug * add triplet loader * empty preprocess file * sync to u2, updating * sync to u2 config * fix bugs * code refine * update config * customize decoding batch size * update optimizer and lr scheduler * minor * minor * minor * fix bugs of refs * minor * distributed sampler * minor * refine the loader 3 years ago			`load_aux_input: bool=False,`
			`load_aux_output: bool=False,`
[s2t] DataLoader with BatchSampler or DistributeBatchSampler (#1242) * batchsampler or distributebatchsampler * format 3 years ago			`num_encs: int=1,`
fix batch sampler set_epoch when epcoh start 3 years ago			`dist_sampler: bool=False,`
			`shortest_first: bool=False):`
more data utils 3 years ago			`self.json_file = json_file`
			`self.train_mode = train_mode`
			`self.use_sortagrad = sortagrad == -1 or sortagrad > 0`
			`self.batch_size = batch_size`
			`self.maxlen_in = maxlen_in`
			`self.maxlen_out = maxlen_out`
			`self.batch_count = batch_count`
			`self.batch_bins = batch_bins`
			`self.batch_frames_in = batch_frames_in`
			`self.batch_frames_out = batch_frames_out`
			`self.batch_frames_inout = batch_frames_inout`
			`self.subsampling_factor = subsampling_factor`
			`self.num_encs = num_encs`
			`self.preprocess_conf = preprocess_conf`
			`self.n_iter_processes = n_iter_processes`
[st] Distributed sampler and new dataloader with MIMO (#1239) * update timit result, test=doc_fix * result update * fix bug * add triplet loader * empty preprocess file * sync to u2, updating * sync to u2 config * fix bugs * code refine * update config * customize decoding batch size * update optimizer and lr scheduler * minor * minor * minor * fix bugs of refs * minor * distributed sampler * minor * refine the loader 3 years ago			`self.load_aux_input = load_aux_input`
			`self.load_aux_output = load_aux_output`
[s2t] DataLoader with BatchSampler or DistributeBatchSampler (#1242) * batchsampler or distributebatchsampler * format 3 years ago			`self.dist_sampler = dist_sampler`
fix batch sampler set_epoch when epcoh start 3 years ago			`self.shortest_first = shortest_first`
more data utils 3 years ago
			`# read json data`
jsonlines reade manifest file 3 years ago			`with jsonlines.open(json_file, 'r') as reader:`
			`self.data_json = list(reader)`
format code 3 years ago
fix for kaldi 3 years ago			`self.feat_dim, self.vocab_size = feat_dim_and_vocab_size(`
			`self.data_json, mode='asr')`
more data utils 3 years ago
			`# make minibatch list (variable length)`
add dataloader; check augmenter base class type 3 years ago			`self.minibaches = make_batchset(`
			`self.data_json,`
more data utils 3 years ago			`batch_size,`
			`maxlen_in,`
			`maxlen_out,`
			`minibatches, # for debug`
			`min_batch_size=mini_batch_size,`
fix batch sampler set_epoch when epcoh start 3 years ago			`shortest_first=self.shortest_first or self.use_sortagrad,`
more data utils 3 years ago			`count=batch_count,`
			`batch_bins=batch_bins,`
			`batch_frames_in=batch_frames_in,`
			`batch_frames_out=batch_frames_out,`
			`batch_frames_inout=batch_frames_inout,`
			`iaxis=0,`
			`oaxis=0, )`

add dataloader; check augmenter base class type 3 years ago			`# data reader`
			`self.reader = LoadInputsAndTargets(`
more data utils 3 years ago			`mode="asr",`
			`load_output=True,`
			`preprocess_conf=preprocess_conf,`
			`preprocess_args={"train":`
			`train_mode}, # Switch the mode of preprocessing`
			`)`

			`# Setup a converter`
			`if num_encs == 1:`
			`self.converter = CustomConverter(`
[st] Distributed sampler and new dataloader with MIMO (#1239) * update timit result, test=doc_fix * result update * fix bug * add triplet loader * empty preprocess file * sync to u2, updating * sync to u2 config * fix bugs * code refine * update config * customize decoding batch size * update optimizer and lr scheduler * minor * minor * minor * fix bugs of refs * minor * distributed sampler * minor * refine the loader 3 years ago			`subsampling_factor=subsampling_factor,`
			`dtype=np.float32,`
			`load_aux_input=load_aux_input,`
			`load_aux_output=load_aux_output)`
more data utils 3 years ago			`else:`
			`assert NotImplementedError("not impl CustomConverterMulEnc.")`

			`# hack to make batchsize argument as 1`
			`# actual bathsize is included in a list`
[st] Distributed sampler and new dataloader with MIMO (#1239) * update timit result, test=doc_fix * result update * fix bug * add triplet loader * empty preprocess file * sync to u2, updating * sync to u2 config * fix bugs * code refine * update config * customize decoding batch size * update optimizer and lr scheduler * minor * minor * minor * fix bugs of refs * minor * distributed sampler * minor * refine the loader 3 years ago			`# default collate function converts numpy array to paddle tensor`
more data utils 3 years ago			`# we used an empty collate function instead which returns list`
fix dataloader pickle bugs 3 years ago			`self.dataset = TransformDataset(self.minibaches, self.converter,`
			`self.reader)`

[s2t] DataLoader with BatchSampler or DistributeBatchSampler (#1242) * batchsampler or distributebatchsampler * format 3 years ago			`if self.dist_sampler:`
fix batch sampler set_epoch when epcoh start 3 years ago			`self.batch_sampler = DistributedBatchSampler(`
[s2t] DataLoader with BatchSampler or DistributeBatchSampler (#1242) * batchsampler or distributebatchsampler * format 3 years ago			`dataset=self.dataset,`
			`batch_size=1,`
			`shuffle=not self.use_sortagrad if self.train_mode else False,`
			`drop_last=False, )`
			`else:`
fix batch sampler set_epoch when epcoh start 3 years ago			`self.batch_sampler = BatchSampler(`
[s2t] DataLoader with BatchSampler or DistributeBatchSampler (#1242) * batchsampler or distributebatchsampler * format 3 years ago			`dataset=self.dataset,`
			`batch_size=1,`
			`shuffle=not self.use_sortagrad if self.train_mode else False,`
			`drop_last=False, )`
[st] Distributed sampler and new dataloader with MIMO (#1239) * update timit result, test=doc_fix * result update * fix bug * add triplet loader * empty preprocess file * sync to u2, updating * sync to u2 config * fix bugs * code refine * update config * customize decoding batch size * update optimizer and lr scheduler * minor * minor * minor * fix bugs of refs * minor * distributed sampler * minor * refine the loader 3 years ago
			`self.dataloader = DataLoader(`
			`dataset=self.dataset,`
fix batch sampler set_epoch when epcoh start 3 years ago			`batch_sampler=self.batch_sampler,`
fix dataloader pickle bugs 3 years ago			`collate_fn=batch_collate,`
			`num_workers=self.n_iter_processes, )`
more data utils 3 years ago
u2 kaldi mutli process test with batchsize one 3 years ago			`def __len__(self):`
			`return len(self.dataloader)`

			`def __iter__(self):`
			`return self.dataloader.__iter__()`

			`def __call__(self):`
			`return self.__iter__()`

more data utils 3 years ago			`def __repr__(self):`
add dataloader; check augmenter base class type 3 years ago			`echo = f"<{self.__class__.__module__}.{self.__class__.__name__} object at {hex(id(self))}> "`
			`echo += f"train_mode: {self.train_mode}, "`
			`echo += f"sortagrad: {self.use_sortagrad}, "`
			`echo += f"batch_size: {self.batch_size}, "`
			`echo += f"maxlen_in: {self.maxlen_in}, "`
			`echo += f"maxlen_out: {self.maxlen_out}, "`
			`echo += f"batch_count: {self.batch_count}, "`
			`echo += f"batch_bins: {self.batch_bins}, "`
			`echo += f"batch_frames_in: {self.batch_frames_in}, "`
			`echo += f"batch_frames_out: {self.batch_frames_out}, "`
			`echo += f"batch_frames_inout: {self.batch_frames_inout}, "`
			`echo += f"subsampling_factor: {self.subsampling_factor}, "`
			`echo += f"num_encs: {self.num_encs}, "`
			`echo += f"num_workers: {self.n_iter_processes}, "`
[s2t] DataLoader with BatchSampler or DistributeBatchSampler (#1242) * batchsampler or distributebatchsampler * format 3 years ago			`echo += f"load_aux_input: {self.load_aux_input}, "`
			`echo += f"load_aux_output: {self.load_aux_output}, "`
			`echo += f"dist_sampler: {self.dist_sampler}, "`
fix batch sampler set_epoch when epcoh start 3 years ago			`echo += f"shortest_first: {self.shortest_first}, "`
add dataloader; check augmenter base class type 3 years ago			`echo += f"file: {self.json_file}"`
			`return echo`