#
# Copyright (c) 2017-2021 NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# This file is part of the WebDataset library.
# See the LICENSE file for licensing terms (BSD-style).
# Modified from https://github.com/webdataset/webdataset
# Modified from wenet(https://github.com/wenet-e2e/wenet)
"""Low level iteration functions for tar archives."""
import random
import re
import tarfile

import braceexpand

from . import filters
from . import gopen
from .handlers import reraise_exception

trace = False
meta_prefix = "__"
meta_suffix = "__"

import paddleaudio
import paddle
import numpy as np

# File extensions that tar_file_and_group_iterator decodes as audio.
AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma'])


def _append_source_to_exception(exn, fileobj):
    """Append ``" @ <fileobj>"`` to the first argument of an exception.

    The first argument is converted with ``str()`` before concatenation so
    that exceptions whose first argument is not a string (for example the
    integer errno of an ``OSError``) do not raise a ``TypeError`` here and
    hide the original error.

    :param exn: exception to annotate in place
    :param fileobj: object identifying the tar source being read
    """
    if hasattr(exn, "args") and len(exn.args) > 0:
        exn.args = (str(exn.args[0]) + " @ " + str(fileobj), ) + exn.args[1:]


def base_plus_ext(path):
    """Split off all file extensions.

    The split happens at the first dot of the last path component, so
    ``"a/b.tar.gz"`` gives ``("a/b", "tar.gz")``.

    :param path: path with extensions
    :returns: tuple ``(base, allext)``, or ``(None, None)`` when the path
        does not match (for example when it contains no dot)
    """
    match = re.match(r"^((?:.*/|)[^.]+)[.]([^/]*)$", path)
    if not match:
        return None, None
    return match.group(1), match.group(2)


def valid_sample(sample):
    """Check whether a sample is valid.

    A sample is valid when it is a non-empty dict whose ``"__bad__"`` entry
    is missing or falsy.

    :param sample: sample to be checked
    :returns: True if the sample is valid, False otherwise
    """
    return (isinstance(sample, dict) and len(sample) > 0 and
            not sample.get("__bad__", False))


# FIXME: UNUSED
def shardlist(urls, *, shuffle=False):
    """Given a list of URLs, yields that list, possibly shuffled.

    :param urls: either a brace-expandable string or an iterable of URLs
    :param shuffle: shuffle the URLs in place before yielding them
    :returns: iterator over ``dict(url=url)``
    """
    if isinstance(urls, str):
        # braceexpand returns an iterator; random.shuffle needs a mutable
        # sequence, so it has to be materialized first.
        urls = list(braceexpand.braceexpand(urls))
    else:
        urls = list(urls)
    if shuffle:
        random.shuffle(urls)
    for url in urls:
        yield dict(url=url)


def url_opener(data, handler=reraise_exception, **kw):
    """Given a stream of url names (packaged in `dict(url=url)`), yield opened streams.

    Each incoming dict is updated in place with a ``"stream"`` entry holding
    the result of ``gopen.gopen(url, **kw)`` and is then yielded.

    :param data: iterable of dicts that contain a ``"url"`` key
    :param handler: called with the exception when opening fails; a truthy
        return value skips the URL, a falsy one stops the iteration
    :param kw: extra keyword arguments forwarded to ``gopen.gopen``
    """
    for sample in data:
        assert isinstance(sample, dict), sample
        assert "url" in sample
        url = sample["url"]
        try:
            stream = gopen.gopen(url, **kw)
            sample.update(stream=stream)
            yield sample
        except Exception as exn:
            # Record which URL failed before handing the error to the handler.
            exn.args = exn.args + (url, )
            if handler(exn):
                continue
            else:
                break


def tar_file_iterator(fileobj,
                      skip_meta=r"__[^/]*__($|/)",
                      handler=reraise_exception):
    """Iterate over tar file, yielding one dict per member of the given tar stream.

    Members with the extension ``wav`` are loaded through
    ``paddleaudio.backends.soundfile_load`` and yielded as
    ``dict(fname=prefix, wav=waveform, sample_rate=sample_rate)``; every other
    member is decoded as UTF-8 text and yielded as
    ``dict(fname=prefix, txt=txt)``. ``prefix`` is the member name up to its
    last dot.

    :param fileobj: byte stream suitable for tarfile
    :param skip_meta: regexp for keys that are skipped entirely (Default value = r"__[^/]*__($|/)")
    :param handler: called with the exception when a member fails; a truthy
        return value skips the member, a falsy one stops the iteration
    """
    stream = tarfile.open(fileobj=fileobj, mode="r:*")
    for tarinfo in stream:
        fname = tarinfo.name
        try:
            if not tarinfo.isreg():
                continue
            if fname is None:
                continue
            if ("/" not in fname and fname.startswith(meta_prefix) and
                    fname.endswith(meta_suffix)):
                # skipping metadata for now
                continue
            if skip_meta is not None and re.match(skip_meta, fname):
                continue

            pos = fname.rfind('.')
            assert pos > 0
            prefix, postfix = fname[:pos], fname[pos + 1:]
            if postfix == 'wav':
                waveform, sample_rate = paddleaudio.backends.soundfile_load(
                    stream.extractfile(tarinfo), normal=False)
                result = dict(
                    fname=prefix, wav=waveform, sample_rate=sample_rate)
            else:
                txt = stream.extractfile(tarinfo).read().decode('utf8').strip()
                result = dict(fname=prefix, txt=txt)
            yield result
            # Drop tarfile's cached member list once the consumer resumes us
            # (presumably to keep memory bounded on large archives).
            stream.members = []
        except Exception as exn:
            _append_source_to_exception(exn, fileobj)
            if handler(exn):
                continue
            else:
                break
    del stream


def tar_file_and_group_iterator(fileobj,
                                skip_meta=r"__[^/]*__($|/)",
                                handler=reraise_exception):
    """Iterate over a tar stream, grouping consecutive members by name prefix.

    The prefix of a member is its name up to the last dot. Consecutive members
    that share a prefix are merged into one dict, which is yielded when the
    prefix changes or the archive ends. Per extension:

    * ``txt``: UTF-8 decoded, stripped text stored under ``"txt"``
    * any extension in ``AUDIO_FORMAT_SETS``: loaded with
      ``paddleaudio.backends.soundfile_load``; stored under ``"wav"`` as a
      float32 paddle tensor with a leading axis of size 1, together with
      ``"sample_rate"``
    * anything else: raw bytes stored under the extension itself

    A sample for which any member failed to decode is dropped, not yielded.

    :param fileobj: byte stream suitable for tarfile
    :param skip_meta: accepted for signature compatibility; not used here
    :param handler: called with the exception when a member fails; a truthy
        return value drops the current sample and continues, a falsy one
        stops the iteration
    :returns: iterator over dicts holding ``"fname"`` plus the entries above
    """
    stream = tarfile.open(fileobj=fileobj, mode="r:*")
    prev_prefix = None
    example = {}
    valid = True
    try:
        for tarinfo in stream:
            name = tarinfo.name
            pos = name.rfind('.')
            assert pos > 0
            prefix, postfix = name[:pos], name[pos + 1:]
            if prev_prefix is not None and prefix != prev_prefix:
                # Prefix changed: the previous group is complete.
                example['fname'] = prev_prefix
                if valid:
                    yield example
                example = {}
                valid = True
            keep_going = True
            with stream.extractfile(tarinfo) as file_obj:
                try:
                    if postfix == 'txt':
                        example['txt'] = file_obj.read().decode('utf8').strip()
                    elif postfix in AUDIO_FORMAT_SETS:
                        waveform, sample_rate = paddleaudio.backends.soundfile_load(
                            file_obj, normal=False)
                        waveform = paddle.to_tensor(
                            np.expand_dims(np.array(waveform), 0),
                            dtype=paddle.float32)
                        example['wav'] = waveform
                        example['sample_rate'] = sample_rate
                    else:
                        example[postfix] = file_obj.read()
                except Exception as exn:
                    _append_source_to_exception(exn, fileobj)
                    # Mark the sample as broken *before* deciding how to go
                    # on, so that it is never yielded half-filled.
                    valid = False
                    keep_going = bool(handler(exn))
            # Always remember the prefix, even after a failure; otherwise the
            # next member of this group would be attributed to the old prefix.
            prev_prefix = prefix
            if not keep_going:
                break
        if prev_prefix is not None and valid:
            example['fname'] = prev_prefix
            yield example
    finally:
        # Release the tar reader even if the consumer stops early or an
        # exception propagates out of the generator.
        stream.close()


def tar_file_expander(data, handler=reraise_exception):
    """Expand a stream of open tar files into a stream of tar file contents.

    Every dict produced by ``tar_file_iterator`` is tagged with the
    ``"__url__"`` of its source and yielded.

    NOTE(review): the assertion below requires a ``"data"`` key, but
    ``tar_file_iterator`` in this file yields ``"wav"``/``"sample_rate"`` or
    ``"txt"`` entries and never ``"data"``, so the check looks like it cannot
    pass -- confirm the intended contract.

    :param data: iterable of dicts with ``"url"`` and ``"stream"`` keys
    :param handler: called with the exception when a source fails; a truthy
        return value skips to the next source, a falsy one stops the iteration
    """
    for source in data:
        url = source["url"]
        try:
            assert isinstance(source, dict)
            assert "stream" in source
            for sample in tar_file_iterator(source["stream"]):
                assert (isinstance(sample, dict) and "data" in sample and
                        "fname" in sample)
                sample["__url__"] = url
                yield sample
        except Exception as exn:
            exn.args = exn.args + (source.get("stream"), source.get("url"))
            if handler(exn):
                continue
            else:
                break


def tar_file_and_group_expander(data, handler=reraise_exception):
    """Expand a stream of open tar files into a stream of grouped samples.

    Every dict produced by ``tar_file_and_group_iterator`` must contain
    ``"wav"``, ``"txt"`` and ``"fname"``; it is tagged with the ``"__url__"``
    of its source and yielded.

    :param data: iterable of dicts with ``"url"`` and ``"stream"`` keys
    :param handler: called with the exception when a source fails; a truthy
        return value skips to the next source, a falsy one stops the iteration
    """
    for source in data:
        url = source["url"]
        try:
            assert isinstance(source, dict)
            assert "stream" in source
            for sample in tar_file_and_group_iterator(source["stream"]):
                assert (isinstance(sample, dict) and "wav" in sample and
                        "txt" in sample and "fname" in sample)
                sample["__url__"] = url
                yield sample
        except Exception as exn:
            exn.args = exn.args + (source.get("stream"), source.get("url"))
            if handler(exn):
                continue
            else:
                break


def group_by_keys(data,
                  keys=base_plus_ext,
                  lcase=True,
                  suffixes=None,
                  handler=None):
    """Group a stream of (fname, data) file samples into per-key samples.

    Consecutive file samples whose ``fname`` splits to the same prefix are
    merged into one dict keyed by suffix, carrying ``"__key__"`` and
    ``"__url__"``. Only samples accepted by ``valid_sample`` are yielded.

    NOTE(review): this reads ``filesample["data"]``, which
    ``tar_file_iterator`` in this file does not produce -- confirm the
    expected producer.

    :param data: iterable of dicts with ``"fname"``, ``"data"`` and ``"__url__"``
    :param keys: function that splits the key into key and extension (base_plus_ext)
    :param lcase: convert suffixes to lower case (Default value = True)
    :param suffixes: if not None, only these suffixes are stored in a sample
    :param handler: accepted but not used by this function
    :raises ValueError: if the same suffix occurs twice within one sample
    """
    current_sample = None
    for filesample in data:
        assert isinstance(filesample, dict)
        fname, value = filesample["fname"], filesample["data"]
        prefix, suffix = keys(fname)
        if trace:
            print(
                prefix,
                suffix,
                current_sample.keys()
                if isinstance(current_sample, dict) else None, )
        if prefix is None:
            continue
        if lcase:
            suffix = suffix.lower()
        if current_sample is None or prefix != current_sample["__key__"]:
            # A new key starts: flush the previous sample first.
            if valid_sample(current_sample):
                yield current_sample
            current_sample = dict(__key__=prefix, __url__=filesample["__url__"])
        if suffix in current_sample:
            raise ValueError(
                f"{fname}: duplicate file name in tar file {suffix} {current_sample.keys()}"
            )
        if suffixes is None or suffix in suffixes:
            current_sample[suffix] = value
    if valid_sample(current_sample):
        yield current_sample


def tarfile_samples(src, handler=reraise_exception):
    """Open every URL in ``src`` and yield the grouped samples of each tar file.

    :param src: iterable of ``dict(url=url)``
    :param handler: exception handler passed to ``url_opener`` and
        ``tar_file_and_group_expander``
    :returns: iterator over sample dicts
    """
    streams = url_opener(src, handler=handler)
    samples = tar_file_and_group_expander(streams, handler=handler)
    return samples


tarfile_to_samples = filters.pipelinefilter(tarfile_samples)