jsonlines read manifest file

pull/1054/head
Hui Zhang 4 years ago
parent 7554b6107a
commit d395c2b8e3

@ -12,9 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the impulse response augmentation model."""
import jsonlines
from paddlespeech.s2t.frontend.audio import AudioSegment
from paddlespeech.s2t.frontend.augmentor.base import AugmentorBase
from paddlespeech.s2t.frontend.utility import read_manifest
class ImpulseResponseAugmentor(AugmentorBase):
@ -28,7 +28,8 @@ class ImpulseResponseAugmentor(AugmentorBase):
def __init__(self, rng, impulse_manifest_path):
self._rng = rng
self._impulse_manifest = read_manifest(impulse_manifest_path)
with jsonlines.open(impulse_manifest_path, 'r') as reader:
self._impulse_manifest = list(reader)
def __call__(self, x, uttid=None, train=True):
if not train:

@ -12,9 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the noise perturb augmentation model."""
import jsonlines
from paddlespeech.s2t.frontend.audio import AudioSegment
from paddlespeech.s2t.frontend.augmentor.base import AugmentorBase
from paddlespeech.s2t.frontend.utility import read_manifest
class NoisePerturbAugmentor(AugmentorBase):
@ -34,7 +34,8 @@ class NoisePerturbAugmentor(AugmentorBase):
self._min_snr_dB = min_snr_dB
self._max_snr_dB = max_snr_dB
self._rng = rng
self._noise_manifest = read_manifest(manifest_path=noise_manifest_path)
with jsonlines.open(noise_manifest_path, 'r') as reader:
self._noise_manifest = list(reader)
def __call__(self, x, uttid=None, train=True):
if not train:

@ -13,10 +13,9 @@
# limitations under the License.
"""Contains feature normalizers."""
import json
import jsonlines
import numpy as np
import paddle
import jsonlines
from paddle.io import DataLoader
from paddle.io import Dataset
@ -27,24 +26,6 @@ from paddlespeech.s2t.utils.log import Log
__all__ = ["FeatureNormalizer"]
logger = Log(__name__).getlog()
def read_manifest(manifest_path):
    """Read every record of a JSON-lines manifest into a list.

    Args:
        manifest_path: Location of the manifest; handed to
            ``jsonlines.open`` in read mode.

    Raises:
        Whatever ``jsonlines`` raises while opening or decoding the file.
        NOTE(review): the previous docstring promised ``IOError`` on parse
        failure, but nothing here raises it explicitly -- confirm.

    Returns:
        list: The decoded records, in file order.
    """
    with jsonlines.open(manifest_path, 'r') as reader:
        # Draining the reader yields the same items as appending one by one.
        records = list(reader)
    return records
# https://github.com/PaddlePaddle/Paddle/pull/31481
class CollateFunc(object):
@ -78,10 +59,9 @@ class CollateFunc(object):
class AudioDataset(Dataset):
def __init__(self, manifest_path, num_samples=-1, rng=None, random_seed=0):
self._rng = rng if rng else np.random.RandomState(random_seed)
manifest = []
with jsonlines.open(manifest_path, 'r') as reader:
for json_data in reader:
manifest.append(json_data)
manifest = list(reader)
if num_samples == -1:
sampled_manifest = manifest

@ -64,27 +64,8 @@ def load_dict(dict_path: Optional[Text], maskctc=False) -> Optional[List[Text]]:
char_list.append(MASKCTC)
return char_list
def read_manifest(manifest_path):
    """Collect all entries of a JSON-lines manifest file.

    Args:
        manifest_path: Manifest file to load; opened through
            ``jsonlines.open`` in read mode.

    Raises:
        Errors from ``jsonlines`` propagate unchanged.
        NOTE(review): the earlier docstring listed ``IOError`` for parse
        failures; this function does not raise it itself -- confirm.

    Returns:
        list: One decoded entry per manifest line, in file order.
    """
    with jsonlines.open(manifest_path, 'r') as reader:
        entries = [entry for entry in reader]
    return entries
def read_manifest_filter(
def read_manifest(
manifest_path,
max_input_len=float('inf'),
min_input_len=0.0,

@ -15,11 +15,11 @@ from typing import Any
from typing import Dict
from typing import List
from typing import Text
import jsonlines
import numpy as np
from paddle.io import DataLoader
from paddlespeech.s2t.frontend.utility import read_manifest
from paddlespeech.s2t.io.batchfy import make_batchset
from paddlespeech.s2t.io.converter import CustomConverter
from paddlespeech.s2t.io.dataset import TransformDataset
@ -91,7 +91,9 @@ class BatchDataLoader():
self.n_iter_processes = n_iter_processes
# read json data
self.data_json = read_manifest(json_file)
with jsonlines.open(json_file, 'r') as reader:
self.data_json = list(reader)
self.feat_dim, self.vocab_size = feat_dim_and_vocab_size(
self.data_json, mode='asr')

@ -14,7 +14,7 @@
# Modified from espnet(https://github.com/espnet/espnet)
# Modified from wenet(https://github.com/wenet-e2e/wenet)
from typing import Optional
import jsonlines
from paddle.io import Dataset
from yacs.config import CfgNode
@ -95,7 +95,7 @@ class ManifestDataset(Dataset):
super().__init__()
# read manifest
self._manifest = read_manifest_filter(
self._manifest = read_manifest(
manifest_path=manifest_path,
max_input_len=max_input_len,
min_input_len=min_input_len,
@ -184,7 +184,8 @@ class AudioDataset(Dataset):
"""
assert batch_type in ['static', 'dynamic']
# read manifest
data = read_manifest(data_file)
with jsonlines.open(data_file, 'r') as reader:
data = list(reader)
if sort:
data = sorted(data, key=lambda x: x["feat_shape"][0])
if raw_wav:

@ -20,8 +20,7 @@ import time
import wave
from time import gmtime
from time import strftime
from paddlespeech.s2t.frontend.utility import read_manifest
import jsonlines
__all__ = ["socket_send", "warm_up_test", "AsrTCPServer", "AsrRequestHandler"]
@ -44,7 +43,8 @@ def warm_up_test(audio_process_handler,
num_test_cases,
random_seed=0):
"""Warming-up test."""
manifest = read_manifest(manifest_path)
with jsonlines.open(manifest_path) as reader:
manifest = list(reader)
rng = random.Random(random_seed)
samples = rng.sample(manifest, num_test_cases)
for idx, sample in enumerate(samples):

@ -16,8 +16,7 @@
import argparse
from pathlib import Path
from typing import Union
from paddlespeech.s2t.frontend.utility import read_manifest
import jsonlines
key_whitelist = set(['feat', 'text', 'syllable', 'phone'])
filename = {
@ -32,7 +31,10 @@ def dump_manifest(manifest_path, output_dir: Union[str, Path]):
output_dir = Path(output_dir).expanduser()
manifest_path = Path(manifest_path).expanduser()
manifest_jsons = read_manifest(manifest_path)
with jsonlines.open(str(manifest_path), 'r') as reader:
manifest_jsons = list(reader)
first_line = manifest_jsons[0]
file_map = {}

@ -15,11 +15,11 @@
"""format manifest with more metadata."""
import argparse
import functools
import jsonlines
import json
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.utility import load_cmvn
from paddlespeech.s2t.frontend.utility import read_manifest
from paddlespeech.s2t.io.utility import feat_type
from paddlespeech.s2t.utils.utility import add_arguments
from paddlespeech.s2t.utils.utility import print_arguments
@ -71,7 +71,9 @@ def main():
# }
count = 0
for manifest_path in args.manifest_paths:
manifest_jsons = read_manifest(manifest_path)
with jsonlines.open(str(manifest_path), 'r') as reader:
manifest_jsons = list(reader)
for line_json in manifest_jsons:
output_json = {
"input": [],

@ -16,10 +16,10 @@
import argparse
import functools
import json
import jsonlines
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.utility import load_cmvn
from paddlespeech.s2t.frontend.utility import read_manifest
from paddlespeech.s2t.io.utility import feat_type
from paddlespeech.s2t.utils.utility import add_arguments
from paddlespeech.s2t.utils.utility import print_arguments
@ -63,7 +63,8 @@ def main():
count = 0
for manifest_path in args.manifest_paths:
manifest_jsons = read_manifest(manifest_path)
with jsonlines.open(str(manifest_path), 'r') as reader:
manifest_jsons = list(reader)
for line_json in manifest_jsons:
# text: translation text, text1: transcript text.
# Currently only support joint-vocab, will add separate vocabs setting.

@ -3,10 +3,10 @@
import argparse
import functools
from pathlib import Path
import jsonlines
from utils.utility import add_arguments
from utils.utility import print_arguments
from utils.utility import read_manifest
def main(args):
@ -19,7 +19,8 @@ def main(args):
dur_scp = outdir / 'duration'
text_scp = outdir / 'text'
manifest_jsons = read_manifest(args.manifest_path)
with jsonlines.open(args.manifest_path, 'r') as reader:
manifest_jsons = list(reader)
with wav_scp.open('w') as fwav, dur_scp.open('w') as fdur, text_scp.open(
'w') as ftxt:

@ -22,32 +22,10 @@ from typing import Text
__all__ = [
"check_md5sum", "getfile_insensitive", "download_multi", "download",
"unpack", "unzip", "md5file", "print_arguments", "add_arguments",
"read_manifest", "get_commandline_args"
"get_commandline_args"
]
def read_manifest(manifest_path):
    """Load a manifest file that stores one JSON document per line.

    Args:
        manifest_path: Path of the manifest file; passed to ``open`` in
            text read mode.

    Raises:
        IOError: If a line cannot be decoded by ``json.loads``. The
            original decoding error is attached as ``__cause__``.

    Returns:
        list: The decoded value of every line, in file order.
    """
    manifest = []
    # Use a context manager so the handle is closed even when a line fails
    # to parse; iterating a bare open() left it to the garbage collector.
    with open(manifest_path, 'r') as manifest_file:
        for json_line in manifest_file:
            try:
                manifest.append(json.loads(json_line))
            except Exception as e:
                # Same exception type and message as before, now chained
                # so the decoding error stays visible in tracebacks.
                raise IOError("Error reading manifest: %s" % str(e)) from e
    return manifest
def get_commandline_args():
extra_chars = [
" ",

Loading…
Cancel
Save