From 7554b6107aa19d29195e8dc908c8bc89e208cdc3 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 30 Nov 2021 08:00:22 +0000 Subject: [PATCH 1/3] using visualdl; fix read_manifest --- paddlespeech/s2t/exps/u2/model.py | 10 +++---- paddlespeech/s2t/exps/u2_kaldi/model.py | 10 +++---- paddlespeech/s2t/exps/u2_st/model.py | 10 +++---- paddlespeech/s2t/frontend/normalizer.py | 27 ++++++++++++++++--- paddlespeech/s2t/frontend/utility.py | 22 +++++++++++++-- paddlespeech/s2t/io/dataset.py | 2 +- paddlespeech/s2t/training/trainer.py | 9 +++---- .../training/trainer.py | 10 +++---- requirements.txt | 1 - utils/build_vocab.py | 14 +++++++--- utils/utility.py | 1 + 11 files changed, 80 insertions(+), 36 deletions(-) diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index b6dbcf44..5dbb72f4 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -128,8 +128,9 @@ class U2Trainer(Trainer): if dist.get_rank() == 0 and self.visualizer: losses_np_v = losses_np.copy() losses_np_v.update({"lr": self.lr_scheduler()}) - self.visualizer.add_scalars("step", losses_np_v, - self.iteration - 1) + for key, val in losses_np_v.items(): + self.visualizer.add_scalar(tag='train/'+key, value=val, step=self.iteration-1) + @paddle.no_grad() def valid(self): @@ -237,9 +238,8 @@ class U2Trainer(Trainer): logger.info( 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss)) if self.visualizer: - self.visualizer.add_scalars( - 'epoch', {'cv_loss': cv_loss, - 'lr': self.lr_scheduler()}, self.epoch) + self.visualizer.add_scalar(tag='eval/cv_loss', value=cv_loss, step=self.epoch) + self.visualizer.add_scalar(tag='eval/lr', value=self.lr_scheduler(), step=self.epoch) self.save(tag=self.epoch, infos={'val_loss': cv_loss}) self.new_epoch() diff --git a/paddlespeech/s2t/exps/u2_kaldi/model.py b/paddlespeech/s2t/exps/u2_kaldi/model.py index c23b4c24..a3f45d8e 100644 --- a/paddlespeech/s2t/exps/u2_kaldi/model.py +++ b/paddlespeech/s2t/exps/u2_kaldi/model.py @@ -131,8 +131,8 @@ class U2Trainer(Trainer): if dist.get_rank() == 0 and self.visualizer: losses_np_v = losses_np.copy() losses_np_v.update({"lr": self.lr_scheduler()}) - self.visualizer.add_scalars("step", losses_np_v, - self.iteration - 1) + for key, val in losses_np_v.items(): + self.visualizer.add_scalar(tag="train/"+key, value=val, step=self.iteration - 1) @paddle.no_grad() def valid(self): @@ -222,9 +222,9 @@ class U2Trainer(Trainer): logger.info( 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss)) if self.visualizer: - self.visualizer.add_scalars( - 'epoch', {'cv_loss': cv_loss, - 'lr': self.lr_scheduler()}, self.epoch) + self.visualizer.add_scalar(tag='eval/cv_loss', value=cv_loss, step=self.epoch) + self.visualizer.add_scalar(tag='eval/lr', value=self.lr_scheduler(), step=self.epoch) + self.save(tag=self.epoch, infos={'val_loss': cv_loss}) self.new_epoch() diff --git a/paddlespeech/s2t/exps/u2_st/model.py b/paddlespeech/s2t/exps/u2_st/model.py index 034463fe..771203cf 100644 --- a/paddlespeech/s2t/exps/u2_st/model.py +++ b/paddlespeech/s2t/exps/u2_st/model.py @@ -138,8 +138,8 @@ class U2STTrainer(Trainer): if dist.get_rank() == 0 and self.visualizer: losses_np_v = losses_np.copy() losses_np_v.update({"lr": self.lr_scheduler()}) - self.visualizer.add_scalars("step", losses_np_v, - self.iteration - 1) + for key, val in losses_np_v.items(): + self.visualizer.add_scalar(tag="train/"+key, value=val, step=self.iteration - 1) @paddle.no_grad() def valid(self): @@ -235,9 +235,9 @@ class U2STTrainer(Trainer): logger.info( 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss)) if self.visualizer: - self.visualizer.add_scalars( - 'epoch', {'cv_loss': cv_loss, - 'lr': self.lr_scheduler()}, self.epoch) + self.visualizer.add_scalar(tag='eval/cv_loss', value=cv_loss, step=self.epoch) + self.visualizer.add_scalar(tag='eval/lr', value=self.lr_scheduler(), step=self.epoch) + self.save(tag=self.epoch, infos={'val_loss': cv_loss}) self.new_epoch() diff --git a/paddlespeech/s2t/frontend/normalizer.py b/paddlespeech/s2t/frontend/normalizer.py index a29cddc3..c55ec9a3 100644 --- a/paddlespeech/s2t/frontend/normalizer.py +++ b/paddlespeech/s2t/frontend/normalizer.py @@ -16,19 +16,36 @@ import json import numpy as np import paddle +import jsonlines from paddle.io import DataLoader from paddle.io import Dataset from paddlespeech.s2t.frontend.audio import AudioSegment from paddlespeech.s2t.frontend.utility import load_cmvn -from paddlespeech.s2t.frontend.utility import read_manifest from paddlespeech.s2t.utils.log import Log __all__ = ["FeatureNormalizer"] logger = Log(__name__).getlog() - +def read_manifest(manifest_path): + """Load and parse manifest file. + + Args: + manifest_path ([type]): Manifest file to load and parse. + Raises: + IOError: If failed to parse the manifest. + + Returns: + List[dict]: Manifest parsing results. + """ + + manifest = [] + with jsonlines.open(manifest_path, 'r') as reader: + for json_data in reader: + manifest.append(json_data) + return manifest + # https://github.com/PaddlePaddle/Paddle/pull/31481 class CollateFunc(object): def __init__(self, feature_func): @@ -61,7 +78,11 @@ class CollateFunc(object): class AudioDataset(Dataset): def __init__(self, manifest_path, num_samples=-1, rng=None, random_seed=0): self._rng = rng if rng else np.random.RandomState(random_seed) - manifest = read_manifest(manifest_path) + manifest = [] + with jsonlines.open(manifest_path, 'r') as reader: + for json_data in reader: + manifest.append(json_data) + if num_samples == -1: sampled_manifest = manifest else: diff --git a/paddlespeech/s2t/frontend/utility.py b/paddlespeech/s2t/frontend/utility.py index d423a604..948aba06 100644 --- a/paddlespeech/s2t/frontend/utility.py +++ b/paddlespeech/s2t/frontend/utility.py @@ -65,7 +65,26 @@ def load_dict(dict_path: Optional[Text], maskctc=False) -> Optional[List[Text]]: return char_list -def read_manifest( +def read_manifest(manifest_path,): + """Load and parse manifest file. + + Args: + manifest_path ([type]): Manifest file to load and parse. + + Raises: + IOError: If failed to parse the manifest. + + Returns: + List[dict]: Manifest parsing results. + """ + manifest = [] + with jsonlines.open(manifest_path, 'r') as reader: + for json_data in reader: + manifest.append(json_data) + return manifest + + +def read_manifest_filter( manifest_path, max_input_len=float('inf'), min_input_len=0.0, @@ -98,7 +117,6 @@ def read_manifest( Returns: List[dict]: Manifest parsing results. """ - manifest = [] with jsonlines.open(manifest_path, 'r') as reader: for json_data in reader: diff --git a/paddlespeech/s2t/io/dataset.py b/paddlespeech/s2t/io/dataset.py index 61eeb00f..006cfe04 100644 --- a/paddlespeech/s2t/io/dataset.py +++ b/paddlespeech/s2t/io/dataset.py @@ -95,7 +95,7 @@ class ManifestDataset(Dataset): super().__init__() # read manifest - self._manifest = read_manifest( + self._manifest = read_manifest_filter( manifest_path=manifest_path, max_input_len=max_input_len, min_input_len=min_input_len, diff --git a/paddlespeech/s2t/training/trainer.py b/paddlespeech/s2t/training/trainer.py index f5fb2db0..be398814 100644 --- a/paddlespeech/s2t/training/trainer.py +++ b/paddlespeech/s2t/training/trainer.py @@ -19,7 +19,7 @@ from pathlib import Path import paddle from paddle import distributed as dist -from tensorboardX import SummaryWriter +from visualdl import LogWriter from paddlespeech.s2t.training.reporter import ObsScope from paddlespeech.s2t.training.reporter import report @@ -309,9 +309,8 @@ class Trainer(): logger.info( 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss)) if self.visualizer: - self.visualizer.add_scalars( - 'epoch', {'cv_loss': cv_loss, - 'lr': self.lr_scheduler()}, self.epoch) + self.visualizer.add_scalar(tag='eval/cv_loss', value=cv_loss, step=self.epoch) + self.visualizer.add_scalar(tag='eval/lr', value=self.lr_scheduler(), step=self.epoch) # after epoch self.save(tag=self.epoch, infos={'val_loss': cv_loss}) @@ -427,7 +426,7 @@ class Trainer(): unexpected behaviors. """ # visualizer - visualizer = SummaryWriter(logdir=str(self.visual_dir)) + visualizer = LogWriter(logdir=str(self.visual_dir)) self.visualizer = visualizer @mp_tools.rank_zero_only diff --git a/paddlespeech/text/speechtask/punctuation_restoration/training/trainer.py b/paddlespeech/text/speechtask/punctuation_restoration/training/trainer.py index d6b6eeb6..ba7ddde3 100644 --- a/paddlespeech/text/speechtask/punctuation_restoration/training/trainer.py +++ b/paddlespeech/text/speechtask/punctuation_restoration/training/trainer.py @@ -34,7 +34,7 @@ from speechtask.punctuation_restoration.model.lstm import RnnLm from speechtask.punctuation_restoration.utils import layer_tools from speechtask.punctuation_restoration.utils import mp_tools from speechtask.punctuation_restoration.utils.checkpoint import Checkpoint -from tensorboardX import SummaryWriter +from visualdl import LogWriter __all__ = ["Trainer", "Tester"] @@ -252,10 +252,8 @@ class Trainer(): self.logger.info("Epoch {} Val info val_loss {}, F1_score {}". format(self.epoch, total_loss, F1_score)) if self.visualizer: - self.visualizer.add_scalars("epoch", { - "total_loss": total_loss, - "lr": self.lr_scheduler() - }, self.epoch) + self.visualizer.add_scalar(tag='eval/cv_loss', value=cv_loss, step=self.epoch) + self.visualizer.add_scalar(tag='eval/lr', value=self.lr_scheduler(), step=self.epoch) self.save( tag=self.epoch, infos={"val_loss": total_loss, @@ -341,7 +339,7 @@ class Trainer(): unexpected behaviors. """ # visualizer - visualizer = SummaryWriter(logdir=str(self.output_dir)) + visualizer = LogWriter(logdir=str(self.output_dir)) self.visualizer = visualizer @mp_tools.rank_zero_only diff --git a/requirements.txt b/requirements.txt index 99e485f8..2ee60d3f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -40,7 +40,6 @@ snakeviz soundfile~=0.10 sox soxbindings -tensorboardX textgrid timer tqdm diff --git a/utils/build_vocab.py b/utils/build_vocab.py index 6a903147..61dc5e25 100755 --- a/utils/build_vocab.py +++ b/utils/build_vocab.py @@ -19,11 +19,11 @@ import argparse import functools import os import tempfile +import jsonlines from collections import Counter from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.frontend.utility import BLANK -from paddlespeech.s2t.frontend.utility import read_manifest from paddlespeech.s2t.frontend.utility import SOS from paddlespeech.s2t.frontend.utility import SPACE from paddlespeech.s2t.frontend.utility import UNK @@ -59,13 +59,21 @@ args = parser.parse_args() def count_manifest(counter, text_feature, manifest_path): - manifest_jsons = read_manifest(manifest_path) + manifest_jsons = [] + with jsonlines.open(manifest_path, 'r') as reader: + for json_data in reader: + manifest_jsons.append(json_data) + for line_json in manifest_jsons: line = text_feature.tokenize(line_json['text'], replace_space=False) counter.update(line) def dump_text_manifest(fileobj, manifest_path, key='text'): - manifest_jsons = read_manifest(manifest_path) + manifest_jsons = [] + with jsonlines.open(manifest_path, 'r') as reader: + for json_data in reader: + manifest_jsons.append(json_data) + for line_json in manifest_jsons: fileobj.write(line_json[key] + "\n") diff --git a/utils/utility.py b/utils/utility.py index b4db518a..29fda268 100755 --- a/utils/utility.py +++ b/utils/utility.py @@ -42,6 +42,7 @@ def read_manifest(manifest_path): for json_line in open(manifest_path, 'r'): try: json_data = json.loads(json_line) + manifest.append(json_data) except Exception as e: raise IOError("Error reading manifest: %s" % str(e)) return manifest From d395c2b8e34cede258b8d070271d2e8aa983ded5 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 30 Nov 2021 08:10:51 +0000 Subject: [PATCH 2/3] jsonlines reade manifest file --- .../frontend/augmentor/impulse_response.py | 5 ++-- .../s2t/frontend/augmentor/noise_perturb.py | 5 ++-- paddlespeech/s2t/frontend/normalizer.py | 26 +++---------------- paddlespeech/s2t/frontend/utility.py | 21 +-------------- paddlespeech/s2t/io/dataloader.py | 6 +++-- paddlespeech/s2t/io/dataset.py | 7 ++--- paddlespeech/s2t/utils/socket_server.py | 6 ++--- utils/dump_manifest.py | 8 +++--- utils/format_data.py | 6 +++-- utils/format_triplet_data.py | 5 ++-- utils/manifest_key_value.py | 5 ++-- utils/utility.py | 24 +---------------- 12 files changed, 37 insertions(+), 87 deletions(-) diff --git a/paddlespeech/s2t/frontend/augmentor/impulse_response.py b/paddlespeech/s2t/frontend/augmentor/impulse_response.py index 6cc9c0d4..1a82bb92 100644 --- a/paddlespeech/s2t/frontend/augmentor/impulse_response.py +++ b/paddlespeech/s2t/frontend/augmentor/impulse_response.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """Contains the impulse response augmentation model.""" +import jsonlines from paddlespeech.s2t.frontend.audio import AudioSegment from paddlespeech.s2t.frontend.augmentor.base import AugmentorBase -from paddlespeech.s2t.frontend.utility import read_manifest class ImpulseResponseAugmentor(AugmentorBase): @@ -28,7 +28,8 @@ class ImpulseResponseAugmentor(AugmentorBase): def __init__(self, rng, impulse_manifest_path): self._rng = rng - self._impulse_manifest = read_manifest(impulse_manifest_path) + with jsonlines.open(impulse_manifest_path, 'r') as reader: + self._impulse_manifest = list(reader) def __call__(self, x, uttid=None, train=True): if not train: diff --git a/paddlespeech/s2t/frontend/augmentor/noise_perturb.py b/paddlespeech/s2t/frontend/augmentor/noise_perturb.py index 9d6da1a8..ce0a8818 100644 --- a/paddlespeech/s2t/frontend/augmentor/noise_perturb.py +++ b/paddlespeech/s2t/frontend/augmentor/noise_perturb.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """Contains the noise perturb augmentation model.""" +import jsonlines from paddlespeech.s2t.frontend.audio import AudioSegment from paddlespeech.s2t.frontend.augmentor.base import AugmentorBase -from paddlespeech.s2t.frontend.utility import read_manifest class NoisePerturbAugmentor(AugmentorBase): @@ -34,7 +34,8 @@ class NoisePerturbAugmentor(AugmentorBase): self._min_snr_dB = min_snr_dB self._max_snr_dB = max_snr_dB self._rng = rng - self._noise_manifest = read_manifest(manifest_path=noise_manifest_path) + with jsonlines.open(noise_manifest_path, 'r') as reader: + self._noise_manifest = list(reader) def __call__(self, x, uttid=None, train=True): if not train: diff --git a/paddlespeech/s2t/frontend/normalizer.py b/paddlespeech/s2t/frontend/normalizer.py index c55ec9a3..0a634fc1 100644 --- a/paddlespeech/s2t/frontend/normalizer.py +++ b/paddlespeech/s2t/frontend/normalizer.py @@ -13,10 +13,9 @@ # limitations under the License. """Contains feature normalizers.""" import json - +import jsonlines import numpy as np import paddle -import jsonlines from paddle.io import DataLoader from paddle.io import Dataset @@ -27,24 +26,6 @@ from paddlespeech.s2t.utils.log import Log __all__ = ["FeatureNormalizer"] logger = Log(__name__).getlog() - -def read_manifest(manifest_path): - """Load and parse manifest file. - - Args: - manifest_path ([type]): Manifest file to load and parse. - Raises: - IOError: If failed to parse the manifest. - - Returns: - List[dict]: Manifest parsing results. - """ - - manifest = [] - with jsonlines.open(manifest_path, 'r') as reader: - for json_data in reader: - manifest.append(json_data) - return manifest # https://github.com/PaddlePaddle/Paddle/pull/31481 class CollateFunc(object): @@ -78,10 +59,9 @@ class CollateFunc(object): class AudioDataset(Dataset): def __init__(self, manifest_path, num_samples=-1, rng=None, random_seed=0): self._rng = rng if rng else np.random.RandomState(random_seed) - manifest = [] + with jsonlines.open(manifest_path, 'r') as reader: - for json_data in reader: - manifest.append(json_data) + manifest = list(reader) if num_samples == -1: sampled_manifest = manifest diff --git a/paddlespeech/s2t/frontend/utility.py b/paddlespeech/s2t/frontend/utility.py index 948aba06..ccb767ad 100644 --- a/paddlespeech/s2t/frontend/utility.py +++ b/paddlespeech/s2t/frontend/utility.py @@ -64,27 +64,8 @@ def load_dict(dict_path: Optional[Text], maskctc=False) -> Optional[List[Text]]: char_list.append(MASKCTC) return char_list - -def read_manifest(manifest_path,): - """Load and parse manifest file. - - Args: - manifest_path ([type]): Manifest file to load and parse. - - Raises: - IOError: If failed to parse the manifest. - - Returns: - List[dict]: Manifest parsing results. - """ - manifest = [] - with jsonlines.open(manifest_path, 'r') as reader: - for json_data in reader: - manifest.append(json_data) - return manifest - -def read_manifest_filter( +def read_manifest( manifest_path, max_input_len=float('inf'), min_input_len=0.0, diff --git a/paddlespeech/s2t/io/dataloader.py b/paddlespeech/s2t/io/dataloader.py index 3b5000a2..bda48842 100644 --- a/paddlespeech/s2t/io/dataloader.py +++ b/paddlespeech/s2t/io/dataloader.py @@ -15,11 +15,11 @@ from typing import Any from typing import Dict from typing import List from typing import Text +import jsonlines import numpy as np from paddle.io import DataLoader -from paddlespeech.s2t.frontend.utility import read_manifest from paddlespeech.s2t.io.batchfy import make_batchset from paddlespeech.s2t.io.converter import CustomConverter from paddlespeech.s2t.io.dataset import TransformDataset @@ -91,7 +91,9 @@ class BatchDataLoader(): self.n_iter_processes = n_iter_processes # read json data - self.data_json = read_manifest(json_file) + with jsonlines.open(json_file, 'r') as reader: + self.data_json = list(reader) + self.feat_dim, self.vocab_size = feat_dim_and_vocab_size( self.data_json, mode='asr') diff --git a/paddlespeech/s2t/io/dataset.py b/paddlespeech/s2t/io/dataset.py index 006cfe04..ba10aebb 100644 --- a/paddlespeech/s2t/io/dataset.py +++ b/paddlespeech/s2t/io/dataset.py @@ -14,7 +14,7 @@ # Modified from espnet(https://github.com/espnet/espnet) # Modified from wenet(https://github.com/wenet-e2e/wenet) from typing import Optional - +import jsonlines from paddle.io import Dataset from yacs.config import CfgNode @@ -95,7 +95,7 @@ class ManifestDataset(Dataset): super().__init__() # read manifest - self._manifest = read_manifest_filter( + self._manifest = read_manifest( manifest_path=manifest_path, max_input_len=max_input_len, min_input_len=min_input_len, @@ -184,7 +184,8 @@ class AudioDataset(Dataset): """ assert batch_type in ['static', 'dynamic'] # read manifest - data = read_manifest(data_file) + with jsonlines.open(data_file, 'r') as reader: + data = list(reader) if sort: data = sorted(data, key=lambda x: x["feat_shape"][0]) if raw_wav: diff --git a/paddlespeech/s2t/utils/socket_server.py b/paddlespeech/s2t/utils/socket_server.py index 43b56d72..6371ba85 100644 --- a/paddlespeech/s2t/utils/socket_server.py +++ b/paddlespeech/s2t/utils/socket_server.py @@ -20,8 +20,7 @@ import time import wave from time import gmtime from time import strftime - -from paddlespeech.s2t.frontend.utility import read_manifest +import jsonlines __all__ = ["socket_send", "warm_up_test", "AsrTCPServer", "AsrRequestHandler"] @@ -44,7 +43,8 @@ def warm_up_test(audio_process_handler, num_test_cases, random_seed=0): """Warming-up test.""" - manifest = read_manifest(manifest_path) + with jsonlines.open(manifest_path) as reader: + manifest = list(reader) rng = random.Random(random_seed) samples = rng.sample(manifest, num_test_cases) for idx, sample in enumerate(samples): diff --git a/utils/dump_manifest.py b/utils/dump_manifest.py index b5f7b64a..d602571d 100755 --- a/utils/dump_manifest.py +++ b/utils/dump_manifest.py @@ -16,8 +16,7 @@ import argparse from pathlib import Path from typing import Union - -from paddlespeech.s2t.frontend.utility import read_manifest +import jsonlines key_whitelist = set(['feat', 'text', 'syllable', 'phone']) filename = { @@ -32,7 +31,10 @@ def dump_manifest(manifest_path, output_dir: Union[str, Path]): output_dir = Path(output_dir).expanduser() manifest_path = Path(manifest_path).expanduser() - manifest_jsons = read_manifest(manifest_path) + + with jsonlines.open(str(manifest_path), 'r') as reader: + manifest_jsons = list(reader) + first_line = manifest_jsons[0] file_map = {} diff --git a/utils/format_data.py b/utils/format_data.py index 2fa1924a..437d7e0f 100755 --- a/utils/format_data.py +++ b/utils/format_data.py @@ -15,11 +15,11 @@ """format manifest with more metadata.""" import argparse import functools +import jsonlines import json from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.frontend.utility import load_cmvn -from paddlespeech.s2t.frontend.utility import read_manifest from paddlespeech.s2t.io.utility import feat_type from paddlespeech.s2t.utils.utility import add_arguments from paddlespeech.s2t.utils.utility import print_arguments @@ -71,7 +71,9 @@ def main(): # } count = 0 for manifest_path in args.manifest_paths: - manifest_jsons = read_manifest(manifest_path) + with jsonlines.open(str(manifest_path), 'r') as reader: + manifest_jsons = list(reader) + for line_json in manifest_jsons: output_json = { "input": [], diff --git a/utils/format_triplet_data.py b/utils/format_triplet_data.py index e0b5ece3..dd9dab42 100755 --- a/utils/format_triplet_data.py +++ b/utils/format_triplet_data.py @@ -16,10 +16,10 @@ import argparse import functools import json +import jsonlines from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.frontend.utility import load_cmvn -from paddlespeech.s2t.frontend.utility import read_manifest from paddlespeech.s2t.io.utility import feat_type from paddlespeech.s2t.utils.utility import add_arguments from paddlespeech.s2t.utils.utility import print_arguments @@ -63,7 +63,8 @@ def main(): count = 0 for manifest_path in args.manifest_paths: - manifest_jsons = read_manifest(manifest_path) + with jsonlines.open(str(manifest_path), 'r') as reader: + manifest_jsons = list(reader) for line_json in manifest_jsons: # text: translation text, text1: transcript text. # Currently only support joint-vocab, will add separate vocabs setting. diff --git a/utils/manifest_key_value.py b/utils/manifest_key_value.py index b409236f..0cfb2450 100755 --- a/utils/manifest_key_value.py +++ b/utils/manifest_key_value.py @@ -3,10 +3,10 @@ import argparse import functools from pathlib import Path +import jsonlines from utils.utility import add_arguments from utils.utility import print_arguments -from utils.utility import read_manifest def main(args): @@ -19,7 +19,8 @@ def main(args): dur_scp = outdir / 'duration' text_scp = outdir / 'text' - manifest_jsons = read_manifest(args.manifest_path) + with jsonlines.open(args.manifest_path, 'r') as reader: + manifest_jsons = list(reader) with wav_scp.open('w') as fwav, dur_scp.open('w') as fdur, text_scp.open( 'w') as ftxt: diff --git a/utils/utility.py b/utils/utility.py index 29fda268..b3523b38 100755 --- a/utils/utility.py +++ b/utils/utility.py @@ -22,32 +22,10 @@ from typing import Text __all__ = [ "check_md5sum", "getfile_insensitive", "download_multi", "download", "unpack", "unzip", "md5file", "print_arguments", "add_arguments", - "read_manifest", "get_commandline_args" + "get_commandline_args" ] -def read_manifest(manifest_path): - """Load and parse manifest file. - Args: - manifest_path ([type]): Manifest file to load and parse. - - Raises: - IOError: If failed to parse the manifest. - - Returns: - List[dict]: Manifest parsing results. - """ - - manifest = [] - for json_line in open(manifest_path, 'r'): - try: - json_data = json.loads(json_line) - manifest.append(json_data) - except Exception as e: - raise IOError("Error reading manifest: %s" % str(e)) - return manifest - - def get_commandline_args(): extra_chars = [ " ", From 39228864bb1b4995de464d57b641ab43a247d9c7 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 30 Nov 2021 08:18:13 +0000 Subject: [PATCH 3/3] format code --- examples/aishell/asr1/READEME.md | 3 --- paddlespeech/s2t/exps/u2/model.py | 10 ++++++---- paddlespeech/s2t/exps/u2_kaldi/model.py | 11 +++++++---- paddlespeech/s2t/exps/u2_st/model.py | 11 +++++++---- .../s2t/frontend/augmentor/impulse_response.py | 1 + paddlespeech/s2t/frontend/augmentor/noise_perturb.py | 1 + paddlespeech/s2t/frontend/normalizer.py | 6 ++++-- paddlespeech/s2t/frontend/utility.py | 2 +- paddlespeech/s2t/io/dataloader.py | 4 ++-- paddlespeech/s2t/io/dataset.py | 1 + paddlespeech/s2t/io/sampler.py | 2 +- paddlespeech/s2t/training/trainer.py | 6 ++++-- paddlespeech/s2t/utils/socket_server.py | 1 + .../punctuation_restoration/training/trainer.py | 6 ++++-- utils/build_vocab.py | 7 ++++--- utils/dump_manifest.py | 3 ++- utils/format_data.py | 5 +++-- utils/format_triplet_data.py | 1 + utils/manifest_key_value.py | 1 + utils/utility.py | 1 - 20 files changed, 51 insertions(+), 32 deletions(-) diff --git a/examples/aishell/asr1/READEME.md b/examples/aishell/asr1/READEME.md index e9fd3017..2eea233d 100644 --- a/examples/aishell/asr1/READEME.md +++ b/examples/aishell/asr1/READEME.md @@ -339,6 +339,3 @@ You need to prepare an audio file, please confirm the sample rate of the audio i ```bash CUDA_VISIBLE_DEVICES= ./local/test_hub.sh conf/transformer.yaml exp/transformer/checkpoints/avg_20 data/test_audio.wav ``` - - - diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 5dbb72f4..d448021c 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -129,8 +129,8 @@ class U2Trainer(Trainer): losses_np_v = losses_np.copy() losses_np_v.update({"lr": self.lr_scheduler()}) for key, val in losses_np_v.items(): - self.visualizer.add_scalar(tag='train/'+key, value=val, step=self.iteration-1) - + self.visualizer.add_scalar( + tag='train/' + key, value=val, step=self.iteration - 1) @paddle.no_grad() def valid(self): @@ -238,8 +238,10 @@ class U2Trainer(Trainer): logger.info( 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss)) if self.visualizer: - self.visualizer.add_scalar(tag='eval/cv_loss', value=cv_loss, step=self.epoch) - self.visualizer.add_scalar(tag='eval/lr', value=self.lr_scheduler(), step=self.epoch) + self.visualizer.add_scalar( + tag='eval/cv_loss', value=cv_loss, step=self.epoch) + self.visualizer.add_scalar( + tag='eval/lr', value=self.lr_scheduler(), step=self.epoch) self.save(tag=self.epoch, infos={'val_loss': cv_loss}) self.new_epoch() diff --git a/paddlespeech/s2t/exps/u2_kaldi/model.py b/paddlespeech/s2t/exps/u2_kaldi/model.py index a3f45d8e..43e31a60 100644 --- a/paddlespeech/s2t/exps/u2_kaldi/model.py +++ b/paddlespeech/s2t/exps/u2_kaldi/model.py @@ -132,7 +132,8 @@ class U2Trainer(Trainer): losses_np_v = losses_np.copy() losses_np_v.update({"lr": self.lr_scheduler()}) for key, val in losses_np_v.items(): - self.visualizer.add_scalar(tag="train/"+key, value=val, step=self.iteration - 1) + self.visualizer.add_scalar( + tag="train/" + key, value=val, step=self.iteration - 1) @paddle.no_grad() def valid(self): @@ -222,9 +223,11 @@ class U2Trainer(Trainer): logger.info( 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss)) if self.visualizer: - self.visualizer.add_scalar(tag='eval/cv_loss', value=cv_loss, step=self.epoch) - self.visualizer.add_scalar(tag='eval/lr', value=self.lr_scheduler(), step=self.epoch) - + self.visualizer.add_scalar( + tag='eval/cv_loss', value=cv_loss, step=self.epoch) + self.visualizer.add_scalar( + tag='eval/lr', value=self.lr_scheduler(), step=self.epoch) + self.save(tag=self.epoch, infos={'val_loss': cv_loss}) self.new_epoch() diff --git a/paddlespeech/s2t/exps/u2_st/model.py b/paddlespeech/s2t/exps/u2_st/model.py index 771203cf..2dbbdcd3 100644 --- a/paddlespeech/s2t/exps/u2_st/model.py +++ b/paddlespeech/s2t/exps/u2_st/model.py @@ -139,7 +139,8 @@ class U2STTrainer(Trainer): losses_np_v = losses_np.copy() losses_np_v.update({"lr": self.lr_scheduler()}) for key, val in losses_np_v.items(): - self.visualizer.add_scalar(tag="train/"+key, value=val, step=self.iteration - 1) + self.visualizer.add_scalar( + tag="train/" + key, value=val, step=self.iteration - 1) @paddle.no_grad() def valid(self): @@ -235,9 +236,11 @@ class U2STTrainer(Trainer): logger.info( 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss)) if self.visualizer: - self.visualizer.add_scalar(tag='eval/cv_loss', value=cv_loss, step=self.epoch) - self.visualizer.add_scalar(tag='eval/lr', value=self.lr_scheduler(), step=self.epoch) - + self.visualizer.add_scalar( + tag='eval/cv_loss', value=cv_loss, step=self.epoch) + self.visualizer.add_scalar( + tag='eval/lr', value=self.lr_scheduler(), step=self.epoch) + self.save(tag=self.epoch, infos={'val_loss': cv_loss}) self.new_epoch() diff --git a/paddlespeech/s2t/frontend/augmentor/impulse_response.py b/paddlespeech/s2t/frontend/augmentor/impulse_response.py index 1a82bb92..5ba45bb2 100644 --- a/paddlespeech/s2t/frontend/augmentor/impulse_response.py +++ b/paddlespeech/s2t/frontend/augmentor/impulse_response.py @@ -13,6 +13,7 @@ # limitations under the License. """Contains the impulse response augmentation model.""" import jsonlines + from paddlespeech.s2t.frontend.audio import AudioSegment from paddlespeech.s2t.frontend.augmentor.base import AugmentorBase diff --git a/paddlespeech/s2t/frontend/augmentor/noise_perturb.py b/paddlespeech/s2t/frontend/augmentor/noise_perturb.py index ce0a8818..71165dac 100644 --- a/paddlespeech/s2t/frontend/augmentor/noise_perturb.py +++ b/paddlespeech/s2t/frontend/augmentor/noise_perturb.py @@ -13,6 +13,7 @@ # limitations under the License. """Contains the noise perturb augmentation model.""" import jsonlines + from paddlespeech.s2t.frontend.audio import AudioSegment from paddlespeech.s2t.frontend.augmentor.base import AugmentorBase diff --git a/paddlespeech/s2t/frontend/normalizer.py b/paddlespeech/s2t/frontend/normalizer.py index 0a634fc1..017851e6 100644 --- a/paddlespeech/s2t/frontend/normalizer.py +++ b/paddlespeech/s2t/frontend/normalizer.py @@ -13,6 +13,7 @@ # limitations under the License. """Contains feature normalizers.""" import json + import jsonlines import numpy as np import paddle @@ -26,7 +27,8 @@ from paddlespeech.s2t.utils.log import Log __all__ = ["FeatureNormalizer"] logger = Log(__name__).getlog() - + + # https://github.com/PaddlePaddle/Paddle/pull/31481 class CollateFunc(object): def __init__(self, feature_func): @@ -62,7 +64,7 @@ class AudioDataset(Dataset): with jsonlines.open(manifest_path, 'r') as reader: manifest = list(reader) - + if num_samples == -1: sampled_manifest = manifest else: diff --git a/paddlespeech/s2t/frontend/utility.py b/paddlespeech/s2t/frontend/utility.py index ccb767ad..175727e1 100644 --- a/paddlespeech/s2t/frontend/utility.py +++ b/paddlespeech/s2t/frontend/utility.py @@ -64,7 +64,7 @@ def load_dict(dict_path: Optional[Text], maskctc=False) -> Optional[List[Text]]: char_list.append(MASKCTC) return char_list - + def read_manifest( manifest_path, max_input_len=float('inf'), diff --git a/paddlespeech/s2t/io/dataloader.py b/paddlespeech/s2t/io/dataloader.py index bda48842..b8eb3367 100644 --- a/paddlespeech/s2t/io/dataloader.py +++ b/paddlespeech/s2t/io/dataloader.py @@ -15,8 +15,8 @@ from typing import Any from typing import Dict from typing import List from typing import Text -import jsonlines +import jsonlines import numpy as np from paddle.io import DataLoader @@ -93,7 +93,7 @@ class BatchDataLoader(): # read json data with jsonlines.open(json_file, 'r') as reader: self.data_json = list(reader) - + self.feat_dim, self.vocab_size = feat_dim_and_vocab_size( self.data_json, mode='asr') diff --git a/paddlespeech/s2t/io/dataset.py b/paddlespeech/s2t/io/dataset.py index ba10aebb..d64d7d3e 100644 --- a/paddlespeech/s2t/io/dataset.py +++ b/paddlespeech/s2t/io/dataset.py @@ -14,6 +14,7 @@ # Modified from espnet(https://github.com/espnet/espnet) # Modified from wenet(https://github.com/wenet-e2e/wenet) from typing import Optional + import jsonlines from paddle.io import Dataset from yacs.config import CfgNode diff --git a/paddlespeech/s2t/io/sampler.py b/paddlespeech/s2t/io/sampler.py index 0d5a16ce..35b57524 100644 --- a/paddlespeech/s2t/io/sampler.py +++ b/paddlespeech/s2t/io/sampler.py @@ -51,7 +51,7 @@ def _batch_shuffle(indices, batch_size, epoch, clipped=False): """ rng = np.random.RandomState(epoch) shift_len = rng.randint(0, batch_size - 1) - batch_indices = list(zip(*[iter(indices[shift_len:])] * batch_size)) + batch_indices = list(zip(* [iter(indices[shift_len:])] * batch_size)) rng.shuffle(batch_indices) batch_indices = [item for batch in batch_indices for item in batch] assert clipped is False diff --git a/paddlespeech/s2t/training/trainer.py b/paddlespeech/s2t/training/trainer.py index be398814..f0099f10 100644 --- a/paddlespeech/s2t/training/trainer.py +++ b/paddlespeech/s2t/training/trainer.py @@ -309,8 +309,10 @@ class Trainer(): logger.info( 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss)) if self.visualizer: - self.visualizer.add_scalar(tag='eval/cv_loss', value=cv_loss, step=self.epoch) - self.visualizer.add_scalar(tag='eval/lr', value=self.lr_scheduler(), step=self.epoch) + self.visualizer.add_scalar( + tag='eval/cv_loss', value=cv_loss, step=self.epoch) + self.visualizer.add_scalar( + tag='eval/lr', value=self.lr_scheduler(), step=self.epoch) # after epoch self.save(tag=self.epoch, infos={'val_loss': cv_loss}) diff --git a/paddlespeech/s2t/utils/socket_server.py b/paddlespeech/s2t/utils/socket_server.py index 6371ba85..691ea966 100644 --- a/paddlespeech/s2t/utils/socket_server.py +++ b/paddlespeech/s2t/utils/socket_server.py @@ -20,6 +20,7 @@ import time import wave from time import gmtime from time import strftime + import jsonlines __all__ = ["socket_send", "warm_up_test", "AsrTCPServer", "AsrRequestHandler"] diff --git a/paddlespeech/text/speechtask/punctuation_restoration/training/trainer.py b/paddlespeech/text/speechtask/punctuation_restoration/training/trainer.py index ba7ddde3..78512796 100644 --- a/paddlespeech/text/speechtask/punctuation_restoration/training/trainer.py +++ b/paddlespeech/text/speechtask/punctuation_restoration/training/trainer.py @@ -252,8 +252,10 @@ class Trainer(): self.logger.info("Epoch {} Val info val_loss {}, F1_score {}". format(self.epoch, total_loss, F1_score)) if self.visualizer: - self.visualizer.add_scalar(tag='eval/cv_loss', value=cv_loss, step=self.epoch) - self.visualizer.add_scalar(tag='eval/lr', value=self.lr_scheduler(), step=self.epoch) + self.visualizer.add_scalar( + tag='eval/cv_loss', value=cv_loss, step=self.epoch) + self.visualizer.add_scalar( + tag='eval/lr', value=self.lr_scheduler(), step=self.epoch) self.save( tag=self.epoch, infos={"val_loss": total_loss, diff --git a/utils/build_vocab.py b/utils/build_vocab.py index 61dc5e25..f832cbbc 100755 --- a/utils/build_vocab.py +++ b/utils/build_vocab.py @@ -19,9 +19,10 @@ import argparse import functools import os import tempfile -import jsonlines from collections import Counter +import jsonlines + from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.frontend.utility import BLANK from paddlespeech.s2t.frontend.utility import SOS @@ -63,7 +64,7 @@ def count_manifest(counter, text_feature, manifest_path): with jsonlines.open(manifest_path, 'r') as reader: for json_data in reader: manifest_jsons.append(json_data) - + for line_json in manifest_jsons: line = text_feature.tokenize(line_json['text'], replace_space=False) counter.update(line) @@ -73,7 +74,7 @@ def dump_text_manifest(fileobj, manifest_path, key='text'): with jsonlines.open(manifest_path, 'r') as reader: for json_data in reader: manifest_jsons.append(json_data) - + for line_json in manifest_jsons: fileobj.write(line_json[key] + "\n") diff --git a/utils/dump_manifest.py b/utils/dump_manifest.py index d602571d..58d91755 100755 --- a/utils/dump_manifest.py +++ b/utils/dump_manifest.py @@ -16,6 +16,7 @@ import argparse from pathlib import Path from typing import Union + import jsonlines key_whitelist = set(['feat', 'text', 'syllable', 'phone']) @@ -34,7 +35,7 @@ def dump_manifest(manifest_path, output_dir: Union[str, Path]): with jsonlines.open(str(manifest_path), 'r') as reader: manifest_jsons = list(reader) - + first_line = manifest_jsons[0] file_map = {} diff --git a/utils/format_data.py b/utils/format_data.py index 437d7e0f..6db2a1bb 100755 --- a/utils/format_data.py +++ b/utils/format_data.py @@ -15,9 +15,10 @@ """format manifest with more metadata.""" import argparse import functools -import jsonlines import json +import jsonlines + from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.frontend.utility import load_cmvn from paddlespeech.s2t.io.utility import feat_type @@ -73,7 +74,7 @@ def main(): for manifest_path in args.manifest_paths: with jsonlines.open(str(manifest_path), 'r') as reader: manifest_jsons = list(reader) - + for line_json in manifest_jsons: output_json = { "input": [], diff --git a/utils/format_triplet_data.py b/utils/format_triplet_data.py index dd9dab42..44ff4527 100755 --- a/utils/format_triplet_data.py +++ b/utils/format_triplet_data.py @@ -16,6 +16,7 @@ import argparse import functools import json + import jsonlines from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer diff --git a/utils/manifest_key_value.py b/utils/manifest_key_value.py index 0cfb2450..fb3d3aaa 100755 --- a/utils/manifest_key_value.py +++ b/utils/manifest_key_value.py @@ -3,6 +3,7 @@ import argparse import functools from pathlib import Path + import jsonlines from utils.utility import add_arguments diff --git a/utils/utility.py b/utils/utility.py index b3523b38..dbf8b1d7 100755 --- a/utils/utility.py +++ b/utils/utility.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. import hashlib -import json import os import sys import tarfile