jsonlines read manifest file

4 years ago · d395c2b8e3
parent 7554b6107a
commit d395c2b8e3
12 changed files with 37 additions and 87 deletions
--- a/paddlespeech/s2t/frontend/augmentor/impulse_response.py
+++ b/paddlespeech/s2t/frontend/augmentor/impulse_response.py
@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Contains the impulse response augmentation model."""
+import jsonlines
 from paddlespeech.s2t.frontend.audio import AudioSegment
 from paddlespeech.s2t.frontend.augmentor.base import AugmentorBase
-from paddlespeech.s2t.frontend.utility import read_manifest


 class ImpulseResponseAugmentor(AugmentorBase):
@ -28,7 +28,8 @@ class ImpulseResponseAugmentor(AugmentorBase):

    def __init__(self, rng, impulse_manifest_path):
        self._rng = rng
-        self._impulse_manifest = read_manifest(impulse_manifest_path)
+        with jsonlines.open(impulse_manifest_path, 'r') as reader:
+            self._impulse_manifest = list(reader)

    def __call__(self, x, uttid=None, train=True):
        if not train:
--- a/paddlespeech/s2t/frontend/augmentor/noise_perturb.py
+++ b/paddlespeech/s2t/frontend/augmentor/noise_perturb.py
@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Contains the noise perturb augmentation model."""
+import jsonlines
 from paddlespeech.s2t.frontend.audio import AudioSegment
 from paddlespeech.s2t.frontend.augmentor.base import AugmentorBase
-from paddlespeech.s2t.frontend.utility import read_manifest


 class NoisePerturbAugmentor(AugmentorBase):
@ -34,7 +34,8 @@ class NoisePerturbAugmentor(AugmentorBase):
        self._min_snr_dB = min_snr_dB
        self._max_snr_dB = max_snr_dB
        self._rng = rng
-        self._noise_manifest = read_manifest(manifest_path=noise_manifest_path)
+        with jsonlines.open(noise_manifest_path, 'r') as reader:
+            self._noise_manifest = list(reader)

    def __call__(self, x, uttid=None, train=True):
        if not train:
--- a/paddlespeech/s2t/frontend/normalizer.py
+++ b/paddlespeech/s2t/frontend/normalizer.py
@ -13,10 +13,9 @@
 # limitations under the License.
 """Contains feature normalizers."""
 import json
-
+import jsonlines
 import numpy as np
 import paddle
-import jsonlines
 from paddle.io import DataLoader
 from paddle.io import Dataset

@ -27,24 +26,6 @@ from paddlespeech.s2t.utils.log import Log
 __all__ = ["FeatureNormalizer"]

 logger = Log(__name__).getlog()
-
-def read_manifest(manifest_path):
-     """Load and parse manifest file.
- 
-     Args:
-         manifest_path ([type]): Manifest file to load and parse.
-     Raises:
-         IOError: If failed to parse the manifest.
- 
-     Returns:
-         List[dict]: Manifest parsing results.
-     """
- 
-     manifest = []
-     with jsonlines.open(manifest_path, 'r') as reader:
-         for json_data in reader:
-            manifest.append(json_data)
-     return manifest
 
 # https://github.com/PaddlePaddle/Paddle/pull/31481
 class CollateFunc(object):
@ -78,10 +59,9 @@ class CollateFunc(object):
 class AudioDataset(Dataset):
    def __init__(self, manifest_path, num_samples=-1, rng=None, random_seed=0):
        self._rng = rng if rng else np.random.RandomState(random_seed)
-        manifest = []
+
        with jsonlines.open(manifest_path, 'r') as reader:
-         for json_data in reader:
-            manifest.append(json_data)
+            manifest = list(reader)
        
        if num_samples == -1:
            sampled_manifest = manifest
--- a/paddlespeech/s2t/frontend/utility.py
+++ b/paddlespeech/s2t/frontend/utility.py
@ -64,27 +64,8 @@ def load_dict(dict_path: Optional[Text], maskctc=False) -> Optional[List[Text]]:
        char_list.append(MASKCTC)
    return char_list

-
-def read_manifest(manifest_path,):
-    """Load and parse manifest file.
-
-    Args:
-        manifest_path ([type]): Manifest file to load and parse.
-
-    Raises:
-        IOError: If failed to parse the manifest.
-
-    Returns:
-        List[dict]: Manifest parsing results.
-    """
-    manifest = []
-    with jsonlines.open(manifest_path, 'r') as reader:
-        for json_data in reader:
-            manifest.append(json_data)
-    return manifest
-
    
-def read_manifest_filter(
+def read_manifest(
        manifest_path,
        max_input_len=float('inf'),
        min_input_len=0.0,
--- a/paddlespeech/s2t/io/dataloader.py
+++ b/paddlespeech/s2t/io/dataloader.py
@ -15,11 +15,11 @@ from typing import Any
 from typing import Dict
 from typing import List
 from typing import Text
+import jsonlines

 import numpy as np
 from paddle.io import DataLoader

-from paddlespeech.s2t.frontend.utility import read_manifest
 from paddlespeech.s2t.io.batchfy import make_batchset
 from paddlespeech.s2t.io.converter import CustomConverter
 from paddlespeech.s2t.io.dataset import TransformDataset
@ -91,7 +91,9 @@ class BatchDataLoader():
        self.n_iter_processes = n_iter_processes

        # read json data
-        self.data_json = read_manifest(json_file)
+        with jsonlines.open(json_file, 'r') as reader:
+            self.data_json = list(reader)
+            
        self.feat_dim, self.vocab_size = feat_dim_and_vocab_size(
            self.data_json, mode='asr')

--- a/paddlespeech/s2t/io/dataset.py
+++ b/paddlespeech/s2t/io/dataset.py
@ -14,7 +14,7 @@
 # Modified from espnet(https://github.com/espnet/espnet)
 # Modified from wenet(https://github.com/wenet-e2e/wenet)
 from typing import Optional
-
+import jsonlines
 from paddle.io import Dataset
 from yacs.config import CfgNode

@ -95,7 +95,7 @@ class ManifestDataset(Dataset):
        super().__init__()

        # read manifest
-        self._manifest = read_manifest_filter(
+        self._manifest = read_manifest(
            manifest_path=manifest_path,
            max_input_len=max_input_len,
            min_input_len=min_input_len,
@ -184,7 +184,8 @@ class AudioDataset(Dataset):
        """
        assert batch_type in ['static', 'dynamic']
        # read manifest
-        data = read_manifest(data_file)
+        with jsonlines.open(data_file, 'r') as reader:
+            data = list(reader)
        if sort:
            data = sorted(data, key=lambda x: x["feat_shape"][0])
        if raw_wav:
--- a/paddlespeech/s2t/utils/socket_server.py
+++ b/paddlespeech/s2t/utils/socket_server.py
@ -20,8 +20,7 @@ import time
 import wave
 from time import gmtime
 from time import strftime
-
-from paddlespeech.s2t.frontend.utility import read_manifest
+import jsonlines

 __all__ = ["socket_send", "warm_up_test", "AsrTCPServer", "AsrRequestHandler"]

@ -44,7 +43,8 @@ def warm_up_test(audio_process_handler,
                 num_test_cases,
                 random_seed=0):
    """Warming-up test."""
-    manifest = read_manifest(manifest_path)
+    with jsonlines.open(manifest_path) as reader:
+        manifest = list(reader)
    rng = random.Random(random_seed)
    samples = rng.sample(manifest, num_test_cases)
    for idx, sample in enumerate(samples):
--- a/utils/dump_manifest.py
+++ b/utils/dump_manifest.py
@ -16,8 +16,7 @@
 import argparse
 from pathlib import Path
 from typing import Union
-
-from paddlespeech.s2t.frontend.utility import read_manifest
+import jsonlines

 key_whitelist = set(['feat', 'text', 'syllable', 'phone'])
 filename = {
@ -32,7 +31,10 @@ def dump_manifest(manifest_path, output_dir: Union[str, Path]):

    output_dir = Path(output_dir).expanduser()
    manifest_path = Path(manifest_path).expanduser()
-    manifest_jsons = read_manifest(manifest_path)
+
+    with jsonlines.open(str(manifest_path), 'r') as reader:
+        manifest_jsons = list(reader)
+        
    first_line = manifest_jsons[0]
    file_map = {}

--- a/utils/format_data.py
+++ b/utils/format_data.py
@ -15,11 +15,11 @@
 """format manifest with more metadata."""
 import argparse
 import functools
+import jsonlines
 import json

 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.frontend.utility import load_cmvn
-from paddlespeech.s2t.frontend.utility import read_manifest
 from paddlespeech.s2t.io.utility import feat_type
 from paddlespeech.s2t.utils.utility import add_arguments
 from paddlespeech.s2t.utils.utility import print_arguments
@ -71,7 +71,9 @@ def main():
    # }
    count = 0
    for manifest_path in args.manifest_paths:
-        manifest_jsons = read_manifest(manifest_path)
+        with jsonlines.open(str(manifest_path), 'r') as reader:
+            manifest_jsons = list(reader)
+        
        for line_json in manifest_jsons:
            output_json = {
                "input": [],
--- a/utils/format_triplet_data.py
+++ b/utils/format_triplet_data.py
@ -16,10 +16,10 @@
 import argparse
 import functools
 import json
+import jsonlines

 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.frontend.utility import load_cmvn
-from paddlespeech.s2t.frontend.utility import read_manifest
 from paddlespeech.s2t.io.utility import feat_type
 from paddlespeech.s2t.utils.utility import add_arguments
 from paddlespeech.s2t.utils.utility import print_arguments
@ -63,7 +63,8 @@ def main():

    count = 0
    for manifest_path in args.manifest_paths:
-        manifest_jsons = read_manifest(manifest_path)
+        with jsonlines.open(str(manifest_path), 'r') as reader:
+            manifest_jsons = list(reader)
        for line_json in manifest_jsons:
            # text: translation text, text1: transcript text.
            # Currently only support joint-vocab, will add separate vocabs setting.
--- a/utils/manifest_key_value.py
+++ b/utils/manifest_key_value.py
@ -3,10 +3,10 @@
 import argparse
 import functools
 from pathlib import Path
+import jsonlines

 from utils.utility import add_arguments
 from utils.utility import print_arguments
-from utils.utility import read_manifest


 def main(args):
@ -19,7 +19,8 @@ def main(args):
    dur_scp = outdir / 'duration'
    text_scp = outdir / 'text'

-    manifest_jsons = read_manifest(args.manifest_path)
+    with jsonlines.open(args.manifest_path, 'r') as reader:
+        manifest_jsons = list(reader)

    with wav_scp.open('w') as fwav, dur_scp.open('w') as fdur, text_scp.open(
            'w') as ftxt:
--- a/utils/utility.py
+++ b/utils/utility.py
@ -22,32 +22,10 @@ from typing import Text
 __all__ = [
    "check_md5sum", "getfile_insensitive", "download_multi", "download",
    "unpack", "unzip", "md5file", "print_arguments", "add_arguments",
-    "read_manifest", "get_commandline_args"
+    "get_commandline_args"
 ]


-def read_manifest(manifest_path):
-    """Load and parse manifest file.
-    Args:
-        manifest_path ([type]): Manifest file to load and parse.
-
-    Raises:
-        IOError: If failed to parse the manifest.
-
-    Returns:
-        List[dict]: Manifest parsing results.
-    """
-
-    manifest = []
-    for json_line in open(manifest_path, 'r'):
-        try:
-            json_data = json.loads(json_line)
-            manifest.append(json_data)
-        except Exception as e:
-            raise IOError("Error reading manifest: %s" % str(e))
-    return manifest
-
-
 def get_commandline_args():
    extra_chars = [
        " ",