diff --git a/paddlespeech/s2t/frontend/augmentor/impulse_response.py b/paddlespeech/s2t/frontend/augmentor/impulse_response.py index 6cc9c0d43..1a82bb923 100644 --- a/paddlespeech/s2t/frontend/augmentor/impulse_response.py +++ b/paddlespeech/s2t/frontend/augmentor/impulse_response.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """Contains the impulse response augmentation model.""" +import jsonlines from paddlespeech.s2t.frontend.audio import AudioSegment from paddlespeech.s2t.frontend.augmentor.base import AugmentorBase -from paddlespeech.s2t.frontend.utility import read_manifest class ImpulseResponseAugmentor(AugmentorBase): @@ -28,7 +28,8 @@ class ImpulseResponseAugmentor(AugmentorBase): def __init__(self, rng, impulse_manifest_path): self._rng = rng - self._impulse_manifest = read_manifest(impulse_manifest_path) + with jsonlines.open(impulse_manifest_path, 'r') as reader: + self._impulse_manifest = list(reader) def __call__(self, x, uttid=None, train=True): if not train: diff --git a/paddlespeech/s2t/frontend/augmentor/noise_perturb.py b/paddlespeech/s2t/frontend/augmentor/noise_perturb.py index 9d6da1a8f..ce0a88186 100644 --- a/paddlespeech/s2t/frontend/augmentor/noise_perturb.py +++ b/paddlespeech/s2t/frontend/augmentor/noise_perturb.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """Contains the noise perturb augmentation model.""" +import jsonlines from paddlespeech.s2t.frontend.audio import AudioSegment from paddlespeech.s2t.frontend.augmentor.base import AugmentorBase -from paddlespeech.s2t.frontend.utility import read_manifest class NoisePerturbAugmentor(AugmentorBase): @@ -34,7 +34,8 @@ class NoisePerturbAugmentor(AugmentorBase): self._min_snr_dB = min_snr_dB self._max_snr_dB = max_snr_dB self._rng = rng - self._noise_manifest = read_manifest(manifest_path=noise_manifest_path) + with jsonlines.open(noise_manifest_path, 'r') as reader: + self._noise_manifest = list(reader) def __call__(self, x, uttid=None, train=True): if not train: diff --git a/paddlespeech/s2t/frontend/normalizer.py b/paddlespeech/s2t/frontend/normalizer.py index c55ec9a3d..0a634fc14 100644 --- a/paddlespeech/s2t/frontend/normalizer.py +++ b/paddlespeech/s2t/frontend/normalizer.py @@ -13,10 +13,9 @@ # limitations under the License. """Contains feature normalizers.""" import json - +import jsonlines import numpy as np import paddle -import jsonlines from paddle.io import DataLoader from paddle.io import Dataset @@ -27,24 +26,6 @@ from paddlespeech.s2t.utils.log import Log __all__ = ["FeatureNormalizer"] logger = Log(__name__).getlog() - -def read_manifest(manifest_path): - """Load and parse manifest file. - - Args: - manifest_path ([type]): Manifest file to load and parse. - Raises: - IOError: If failed to parse the manifest. - - Returns: - List[dict]: Manifest parsing results. - """ - - manifest = [] - with jsonlines.open(manifest_path, 'r') as reader: - for json_data in reader: - manifest.append(json_data) - return manifest # https://github.com/PaddlePaddle/Paddle/pull/31481 class CollateFunc(object): @@ -78,10 +59,9 @@ class CollateFunc(object): class AudioDataset(Dataset): def __init__(self, manifest_path, num_samples=-1, rng=None, random_seed=0): self._rng = rng if rng else np.random.RandomState(random_seed) - manifest = [] + with jsonlines.open(manifest_path, 'r') as reader: - for json_data in reader: - manifest.append(json_data) + manifest = list(reader) if num_samples == -1: sampled_manifest = manifest diff --git a/paddlespeech/s2t/frontend/utility.py b/paddlespeech/s2t/frontend/utility.py index 948aba065..ccb767adc 100644 --- a/paddlespeech/s2t/frontend/utility.py +++ b/paddlespeech/s2t/frontend/utility.py @@ -64,27 +64,8 @@ def load_dict(dict_path: Optional[Text], maskctc=False) -> Optional[List[Text]]: char_list.append(MASKCTC) return char_list - -def read_manifest(manifest_path,): - """Load and parse manifest file. - - Args: - manifest_path ([type]): Manifest file to load and parse. - - Raises: - IOError: If failed to parse the manifest. - - Returns: - List[dict]: Manifest parsing results. - """ - manifest = [] - with jsonlines.open(manifest_path, 'r') as reader: - for json_data in reader: - manifest.append(json_data) - return manifest - -def read_manifest_filter( +def read_manifest( manifest_path, max_input_len=float('inf'), min_input_len=0.0, diff --git a/paddlespeech/s2t/io/dataloader.py b/paddlespeech/s2t/io/dataloader.py index 3b5000a28..bda48842a 100644 --- a/paddlespeech/s2t/io/dataloader.py +++ b/paddlespeech/s2t/io/dataloader.py @@ -15,11 +15,11 @@ from typing import Any from typing import Dict from typing import List from typing import Text +import jsonlines import numpy as np from paddle.io import DataLoader -from paddlespeech.s2t.frontend.utility import read_manifest from paddlespeech.s2t.io.batchfy import make_batchset from paddlespeech.s2t.io.converter import CustomConverter from paddlespeech.s2t.io.dataset import TransformDataset @@ -91,7 +91,9 @@ class BatchDataLoader(): self.n_iter_processes = n_iter_processes # read json data - self.data_json = read_manifest(json_file) + with jsonlines.open(json_file, 'r') as reader: + self.data_json = list(reader) + self.feat_dim, self.vocab_size = feat_dim_and_vocab_size( self.data_json, mode='asr') diff --git a/paddlespeech/s2t/io/dataset.py b/paddlespeech/s2t/io/dataset.py index 006cfe041..ba10aebbb 100644 --- a/paddlespeech/s2t/io/dataset.py +++ b/paddlespeech/s2t/io/dataset.py @@ -14,7 +14,7 @@ # Modified from espnet(https://github.com/espnet/espnet) # Modified from wenet(https://github.com/wenet-e2e/wenet) from typing import Optional - +import jsonlines from paddle.io import Dataset from yacs.config import CfgNode @@ -95,7 +95,7 @@ class ManifestDataset(Dataset): super().__init__() # read manifest - self._manifest = read_manifest_filter( + self._manifest = read_manifest( manifest_path=manifest_path, max_input_len=max_input_len, min_input_len=min_input_len, @@ -184,7 +184,8 @@ class AudioDataset(Dataset): """ assert batch_type in ['static', 'dynamic'] # read manifest - data = read_manifest(data_file) + with jsonlines.open(data_file, 'r') as reader: + data = list(reader) if sort: data = sorted(data, key=lambda x: x["feat_shape"][0]) if raw_wav: diff --git a/paddlespeech/s2t/utils/socket_server.py b/paddlespeech/s2t/utils/socket_server.py index 43b56d723..6371ba85e 100644 --- a/paddlespeech/s2t/utils/socket_server.py +++ b/paddlespeech/s2t/utils/socket_server.py @@ -20,8 +20,7 @@ import time import wave from time import gmtime from time import strftime - -from paddlespeech.s2t.frontend.utility import read_manifest +import jsonlines __all__ = ["socket_send", "warm_up_test", "AsrTCPServer", "AsrRequestHandler"] @@ -44,7 +43,8 @@ def warm_up_test(audio_process_handler, num_test_cases, random_seed=0): """Warming-up test.""" - manifest = read_manifest(manifest_path) + with jsonlines.open(manifest_path) as reader: + manifest = list(reader) rng = random.Random(random_seed) samples = rng.sample(manifest, num_test_cases) for idx, sample in enumerate(samples): diff --git a/utils/dump_manifest.py b/utils/dump_manifest.py index b5f7b64a4..d602571d5 100755 --- a/utils/dump_manifest.py +++ b/utils/dump_manifest.py @@ -16,8 +16,7 @@ import argparse from pathlib import Path from typing import Union - -from paddlespeech.s2t.frontend.utility import read_manifest +import jsonlines key_whitelist = set(['feat', 'text', 'syllable', 'phone']) filename = { @@ -32,7 +31,10 @@ def dump_manifest(manifest_path, output_dir: Union[str, Path]): output_dir = Path(output_dir).expanduser() manifest_path = Path(manifest_path).expanduser() - manifest_jsons = read_manifest(manifest_path) + + with jsonlines.open(str(manifest_path), 'r') as reader: + manifest_jsons = list(reader) + first_line = manifest_jsons[0] file_map = {} diff --git a/utils/format_data.py b/utils/format_data.py index 2fa1924a0..437d7e0f0 100755 --- a/utils/format_data.py +++ b/utils/format_data.py @@ -15,11 +15,11 @@ """format manifest with more metadata.""" import argparse import functools +import jsonlines import json from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.frontend.utility import load_cmvn -from paddlespeech.s2t.frontend.utility import read_manifest from paddlespeech.s2t.io.utility import feat_type from paddlespeech.s2t.utils.utility import add_arguments from paddlespeech.s2t.utils.utility import print_arguments @@ -71,7 +71,9 @@ def main(): # } count = 0 for manifest_path in args.manifest_paths: - manifest_jsons = read_manifest(manifest_path) + with jsonlines.open(str(manifest_path), 'r') as reader: + manifest_jsons = list(reader) + for line_json in manifest_jsons: output_json = { "input": [], diff --git a/utils/format_triplet_data.py b/utils/format_triplet_data.py index e0b5ece37..dd9dab42c 100755 --- a/utils/format_triplet_data.py +++ b/utils/format_triplet_data.py @@ -16,10 +16,10 @@ import argparse import functools import json +import jsonlines from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.frontend.utility import load_cmvn -from paddlespeech.s2t.frontend.utility import read_manifest from paddlespeech.s2t.io.utility import feat_type from paddlespeech.s2t.utils.utility import add_arguments from paddlespeech.s2t.utils.utility import print_arguments @@ -63,7 +63,8 @@ def main(): count = 0 for manifest_path in args.manifest_paths: - manifest_jsons = read_manifest(manifest_path) + with jsonlines.open(str(manifest_path), 'r') as reader: + manifest_jsons = list(reader) for line_json in manifest_jsons: # text: translation text, text1: transcript text. # Currently only support joint-vocab, will add separate vocabs setting. diff --git a/utils/manifest_key_value.py b/utils/manifest_key_value.py index b409236fc..0cfb2450e 100755 --- a/utils/manifest_key_value.py +++ b/utils/manifest_key_value.py @@ -3,10 +3,10 @@ import argparse import functools from pathlib import Path +import jsonlines from utils.utility import add_arguments from utils.utility import print_arguments -from utils.utility import read_manifest def main(args): @@ -19,7 +19,8 @@ def main(args): dur_scp = outdir / 'duration' text_scp = outdir / 'text' - manifest_jsons = read_manifest(args.manifest_path) + with jsonlines.open(args.manifest_path, 'r') as reader: + manifest_jsons = list(reader) with wav_scp.open('w') as fwav, dur_scp.open('w') as fdur, text_scp.open( 'w') as ftxt: diff --git a/utils/utility.py b/utils/utility.py index 29fda2685..b3523b383 100755 --- a/utils/utility.py +++ b/utils/utility.py @@ -22,32 +22,10 @@ from typing import Text __all__ = [ "check_md5sum", "getfile_insensitive", "download_multi", "download", "unpack", "unzip", "md5file", "print_arguments", "add_arguments", - "read_manifest", "get_commandline_args" + "get_commandline_args" ] -def read_manifest(manifest_path): - """Load and parse manifest file. - Args: - manifest_path ([type]): Manifest file to load and parse. - - Raises: - IOError: If failed to parse the manifest. - - Returns: - List[dict]: Manifest parsing results. - """ - - manifest = [] - for json_line in open(manifest_path, 'r'): - try: - json_data = json.loads(json_line) - manifest.append(json_data) - except Exception as e: - raise IOError("Error reading manifest: %s" % str(e)) - return manifest - - def get_commandline_args(): extra_chars = [ " ",