Read manifest files with jsonlines

pull/1054/head
Hui Zhang 4 years ago
parent 7554b6107a
commit d395c2b8e3

@ -12,9 +12,9 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""Contains the impulse response augmentation model.""" """Contains the impulse response augmentation model."""
import jsonlines
from paddlespeech.s2t.frontend.audio import AudioSegment from paddlespeech.s2t.frontend.audio import AudioSegment
from paddlespeech.s2t.frontend.augmentor.base import AugmentorBase from paddlespeech.s2t.frontend.augmentor.base import AugmentorBase
from paddlespeech.s2t.frontend.utility import read_manifest
class ImpulseResponseAugmentor(AugmentorBase): class ImpulseResponseAugmentor(AugmentorBase):
@ -28,7 +28,8 @@ class ImpulseResponseAugmentor(AugmentorBase):
def __init__(self, rng, impulse_manifest_path): def __init__(self, rng, impulse_manifest_path):
self._rng = rng self._rng = rng
self._impulse_manifest = read_manifest(impulse_manifest_path) with jsonlines.open(impulse_manifest_path, 'r') as reader:
self._impulse_manifest = list(reader)
def __call__(self, x, uttid=None, train=True): def __call__(self, x, uttid=None, train=True):
if not train: if not train:

@ -12,9 +12,9 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""Contains the noise perturb augmentation model.""" """Contains the noise perturb augmentation model."""
import jsonlines
from paddlespeech.s2t.frontend.audio import AudioSegment from paddlespeech.s2t.frontend.audio import AudioSegment
from paddlespeech.s2t.frontend.augmentor.base import AugmentorBase from paddlespeech.s2t.frontend.augmentor.base import AugmentorBase
from paddlespeech.s2t.frontend.utility import read_manifest
class NoisePerturbAugmentor(AugmentorBase): class NoisePerturbAugmentor(AugmentorBase):
@ -34,7 +34,8 @@ class NoisePerturbAugmentor(AugmentorBase):
self._min_snr_dB = min_snr_dB self._min_snr_dB = min_snr_dB
self._max_snr_dB = max_snr_dB self._max_snr_dB = max_snr_dB
self._rng = rng self._rng = rng
self._noise_manifest = read_manifest(manifest_path=noise_manifest_path) with jsonlines.open(noise_manifest_path, 'r') as reader:
self._noise_manifest = list(reader)
def __call__(self, x, uttid=None, train=True): def __call__(self, x, uttid=None, train=True):
if not train: if not train:

@ -13,10 +13,9 @@
# limitations under the License. # limitations under the License.
"""Contains feature normalizers.""" """Contains feature normalizers."""
import json import json
import jsonlines
import numpy as np import numpy as np
import paddle import paddle
import jsonlines
from paddle.io import DataLoader from paddle.io import DataLoader
from paddle.io import Dataset from paddle.io import Dataset
@ -27,24 +26,6 @@ from paddlespeech.s2t.utils.log import Log
__all__ = ["FeatureNormalizer"] __all__ = ["FeatureNormalizer"]
logger = Log(__name__).getlog() logger = Log(__name__).getlog()
def read_manifest(manifest_path):
    """Read every record of a JSON-lines manifest into a list.

    Args:
        manifest_path: Path handed to ``jsonlines.open`` in read mode.

    Returns:
        list: The objects yielded by the jsonlines reader, in file order.

    Note:
        Nothing is caught here, so any error raised by ``jsonlines`` while
        opening or parsing the file propagates to the caller unchanged.
    """
    with jsonlines.open(manifest_path, 'r') as reader:
        # Draining the reader inside the ``with`` keeps the file open only
        # for as long as the records are being consumed.
        return list(reader)
# https://github.com/PaddlePaddle/Paddle/pull/31481 # https://github.com/PaddlePaddle/Paddle/pull/31481
class CollateFunc(object): class CollateFunc(object):
@ -78,10 +59,9 @@ class CollateFunc(object):
class AudioDataset(Dataset): class AudioDataset(Dataset):
def __init__(self, manifest_path, num_samples=-1, rng=None, random_seed=0): def __init__(self, manifest_path, num_samples=-1, rng=None, random_seed=0):
self._rng = rng if rng else np.random.RandomState(random_seed) self._rng = rng if rng else np.random.RandomState(random_seed)
manifest = []
with jsonlines.open(manifest_path, 'r') as reader: with jsonlines.open(manifest_path, 'r') as reader:
for json_data in reader: manifest = list(reader)
manifest.append(json_data)
if num_samples == -1: if num_samples == -1:
sampled_manifest = manifest sampled_manifest = manifest

@ -64,27 +64,8 @@ def load_dict(dict_path: Optional[Text], maskctc=False) -> Optional[List[Text]]:
char_list.append(MASKCTC) char_list.append(MASKCTC)
return char_list return char_list
def read_manifest(manifest_path,):
    """Collect all entries of a JSON-lines manifest file.

    Args:
        manifest_path: Location of the manifest; opened for reading through
            ``jsonlines.open``.

    Returns:
        list: One element per record produced by the jsonlines reader,
        preserving the order in which they appear in the file.

    Note:
        No exception handling is done here; failures from ``jsonlines``
        (opening or decoding) reach the caller as raised.
    """
    with jsonlines.open(manifest_path, 'r') as reader:
        entries = [entry for entry in reader]
    return entries
def read_manifest_filter( def read_manifest(
manifest_path, manifest_path,
max_input_len=float('inf'), max_input_len=float('inf'),
min_input_len=0.0, min_input_len=0.0,

@ -15,11 +15,11 @@ from typing import Any
from typing import Dict from typing import Dict
from typing import List from typing import List
from typing import Text from typing import Text
import jsonlines
import numpy as np import numpy as np
from paddle.io import DataLoader from paddle.io import DataLoader
from paddlespeech.s2t.frontend.utility import read_manifest
from paddlespeech.s2t.io.batchfy import make_batchset from paddlespeech.s2t.io.batchfy import make_batchset
from paddlespeech.s2t.io.converter import CustomConverter from paddlespeech.s2t.io.converter import CustomConverter
from paddlespeech.s2t.io.dataset import TransformDataset from paddlespeech.s2t.io.dataset import TransformDataset
@ -91,7 +91,9 @@ class BatchDataLoader():
self.n_iter_processes = n_iter_processes self.n_iter_processes = n_iter_processes
# read json data # read json data
self.data_json = read_manifest(json_file) with jsonlines.open(json_file, 'r') as reader:
self.data_json = list(reader)
self.feat_dim, self.vocab_size = feat_dim_and_vocab_size( self.feat_dim, self.vocab_size = feat_dim_and_vocab_size(
self.data_json, mode='asr') self.data_json, mode='asr')

@ -14,7 +14,7 @@
# Modified from espnet(https://github.com/espnet/espnet) # Modified from espnet(https://github.com/espnet/espnet)
# Modified from wenet(https://github.com/wenet-e2e/wenet) # Modified from wenet(https://github.com/wenet-e2e/wenet)
from typing import Optional from typing import Optional
import jsonlines
from paddle.io import Dataset from paddle.io import Dataset
from yacs.config import CfgNode from yacs.config import CfgNode
@ -95,7 +95,7 @@ class ManifestDataset(Dataset):
super().__init__() super().__init__()
# read manifest # read manifest
self._manifest = read_manifest_filter( self._manifest = read_manifest(
manifest_path=manifest_path, manifest_path=manifest_path,
max_input_len=max_input_len, max_input_len=max_input_len,
min_input_len=min_input_len, min_input_len=min_input_len,
@ -184,7 +184,8 @@ class AudioDataset(Dataset):
""" """
assert batch_type in ['static', 'dynamic'] assert batch_type in ['static', 'dynamic']
# read manifest # read manifest
data = read_manifest(data_file) with jsonlines.open(data_file, 'r') as reader:
data = list(reader)
if sort: if sort:
data = sorted(data, key=lambda x: x["feat_shape"][0]) data = sorted(data, key=lambda x: x["feat_shape"][0])
if raw_wav: if raw_wav:

@ -20,8 +20,7 @@ import time
import wave import wave
from time import gmtime from time import gmtime
from time import strftime from time import strftime
import jsonlines
from paddlespeech.s2t.frontend.utility import read_manifest
__all__ = ["socket_send", "warm_up_test", "AsrTCPServer", "AsrRequestHandler"] __all__ = ["socket_send", "warm_up_test", "AsrTCPServer", "AsrRequestHandler"]
@ -44,7 +43,8 @@ def warm_up_test(audio_process_handler,
num_test_cases, num_test_cases,
random_seed=0): random_seed=0):
"""Warming-up test.""" """Warming-up test."""
manifest = read_manifest(manifest_path) with jsonlines.open(manifest_path) as reader:
manifest = list(reader)
rng = random.Random(random_seed) rng = random.Random(random_seed)
samples = rng.sample(manifest, num_test_cases) samples = rng.sample(manifest, num_test_cases)
for idx, sample in enumerate(samples): for idx, sample in enumerate(samples):

@ -16,8 +16,7 @@
import argparse import argparse
from pathlib import Path from pathlib import Path
from typing import Union from typing import Union
import jsonlines
from paddlespeech.s2t.frontend.utility import read_manifest
key_whitelist = set(['feat', 'text', 'syllable', 'phone']) key_whitelist = set(['feat', 'text', 'syllable', 'phone'])
filename = { filename = {
@ -32,7 +31,10 @@ def dump_manifest(manifest_path, output_dir: Union[str, Path]):
output_dir = Path(output_dir).expanduser() output_dir = Path(output_dir).expanduser()
manifest_path = Path(manifest_path).expanduser() manifest_path = Path(manifest_path).expanduser()
manifest_jsons = read_manifest(manifest_path)
with jsonlines.open(str(manifest_path), 'r') as reader:
manifest_jsons = list(reader)
first_line = manifest_jsons[0] first_line = manifest_jsons[0]
file_map = {} file_map = {}

@ -15,11 +15,11 @@
"""format manifest with more metadata.""" """format manifest with more metadata."""
import argparse import argparse
import functools import functools
import jsonlines
import json import json
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.utility import load_cmvn from paddlespeech.s2t.frontend.utility import load_cmvn
from paddlespeech.s2t.frontend.utility import read_manifest
from paddlespeech.s2t.io.utility import feat_type from paddlespeech.s2t.io.utility import feat_type
from paddlespeech.s2t.utils.utility import add_arguments from paddlespeech.s2t.utils.utility import add_arguments
from paddlespeech.s2t.utils.utility import print_arguments from paddlespeech.s2t.utils.utility import print_arguments
@ -71,7 +71,9 @@ def main():
# } # }
count = 0 count = 0
for manifest_path in args.manifest_paths: for manifest_path in args.manifest_paths:
manifest_jsons = read_manifest(manifest_path) with jsonlines.open(str(manifest_path), 'r') as reader:
manifest_jsons = list(reader)
for line_json in manifest_jsons: for line_json in manifest_jsons:
output_json = { output_json = {
"input": [], "input": [],

@ -16,10 +16,10 @@
import argparse import argparse
import functools import functools
import json import json
import jsonlines
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.utility import load_cmvn from paddlespeech.s2t.frontend.utility import load_cmvn
from paddlespeech.s2t.frontend.utility import read_manifest
from paddlespeech.s2t.io.utility import feat_type from paddlespeech.s2t.io.utility import feat_type
from paddlespeech.s2t.utils.utility import add_arguments from paddlespeech.s2t.utils.utility import add_arguments
from paddlespeech.s2t.utils.utility import print_arguments from paddlespeech.s2t.utils.utility import print_arguments
@ -63,7 +63,8 @@ def main():
count = 0 count = 0
for manifest_path in args.manifest_paths: for manifest_path in args.manifest_paths:
manifest_jsons = read_manifest(manifest_path) with jsonlines.open(str(manifest_path), 'r') as reader:
manifest_jsons = list(reader)
for line_json in manifest_jsons: for line_json in manifest_jsons:
# text: translation text, text1: transcript text. # text: translation text, text1: transcript text.
# Currently only support joint-vocab, will add separate vocabs setting. # Currently only support joint-vocab, will add separate vocabs setting.

@ -3,10 +3,10 @@
import argparse import argparse
import functools import functools
from pathlib import Path from pathlib import Path
import jsonlines
from utils.utility import add_arguments from utils.utility import add_arguments
from utils.utility import print_arguments from utils.utility import print_arguments
from utils.utility import read_manifest
def main(args): def main(args):
@ -19,7 +19,8 @@ def main(args):
dur_scp = outdir / 'duration' dur_scp = outdir / 'duration'
text_scp = outdir / 'text' text_scp = outdir / 'text'
manifest_jsons = read_manifest(args.manifest_path) with jsonlines.open(args.manifest_path, 'r') as reader:
manifest_jsons = list(reader)
with wav_scp.open('w') as fwav, dur_scp.open('w') as fdur, text_scp.open( with wav_scp.open('w') as fwav, dur_scp.open('w') as fdur, text_scp.open(
'w') as ftxt: 'w') as ftxt:

@ -22,32 +22,10 @@ from typing import Text
__all__ = [ __all__ = [
"check_md5sum", "getfile_insensitive", "download_multi", "download", "check_md5sum", "getfile_insensitive", "download_multi", "download",
"unpack", "unzip", "md5file", "print_arguments", "add_arguments", "unpack", "unzip", "md5file", "print_arguments", "add_arguments",
"read_manifest", "get_commandline_args" "get_commandline_args"
] ]
def read_manifest(manifest_path):
    """Load and parse a manifest file holding one JSON document per line.

    Args:
        manifest_path (str): Manifest file to load and parse.

    Raises:
        IOError: If any line fails to parse as JSON. The original parser
            error is attached as the exception's cause.

    Returns:
        list: Parsed entries in file order (dicts for a regular manifest);
        an empty list when the file has no lines.
    """
    manifest = []
    # Open through a context manager so the handle is closed deterministically,
    # including when a malformed line aborts the loop. The previous bare
    # ``open()`` in the ``for`` header never closed the file explicitly.
    with open(manifest_path, 'r') as manifest_file:
        for json_line in manifest_file:
            try:
                manifest.append(json.loads(json_line))
            except Exception as e:
                # Keep the historical contract: every per-line parse failure
                # surfaces as IOError with the same message text.
                raise IOError("Error reading manifest: %s" % str(e)) from e
    return manifest
def get_commandline_args(): def get_commandline_args():
extra_chars = [ extra_chars = [
" ", " ",

Loading…
Cancel
Save