From 7554b6107aa19d29195e8dc908c8bc89e208cdc3 Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Tue, 30 Nov 2021 08:00:22 +0000
Subject: [PATCH 1/3] using visualdl; fix read_manifest

---
 paddlespeech/s2t/exps/u2/model.py             | 10 +++----
 paddlespeech/s2t/exps/u2_kaldi/model.py       | 10 +++----
 paddlespeech/s2t/exps/u2_st/model.py          | 10 +++----
 paddlespeech/s2t/frontend/normalizer.py       | 27 ++++++++++++++++---
 paddlespeech/s2t/frontend/utility.py          | 22 +++++++++++++--
 paddlespeech/s2t/io/dataset.py                |  2 +-
 paddlespeech/s2t/training/trainer.py          |  9 +++----
 .../training/trainer.py                       | 10 +++----
 requirements.txt                              |  1 -
 utils/build_vocab.py                          | 14 +++++++---
 utils/utility.py                              |  1 +
 11 files changed, 80 insertions(+), 36 deletions(-)

diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py
index b6dbcf44..5dbb72f4 100644
--- a/paddlespeech/s2t/exps/u2/model.py
+++ b/paddlespeech/s2t/exps/u2/model.py
@@ -128,8 +128,9 @@ class U2Trainer(Trainer):
             if dist.get_rank() == 0 and self.visualizer:
                 losses_np_v = losses_np.copy()
                 losses_np_v.update({"lr": self.lr_scheduler()})
-                self.visualizer.add_scalars("step", losses_np_v,
-                                            self.iteration - 1)
+                for key, val in losses_np_v.items():
+                    self.visualizer.add_scalar(tag='train/'+key, value=val, step=self.iteration-1)
+
 
     @paddle.no_grad()
     def valid(self):
@@ -237,9 +238,8 @@ class U2Trainer(Trainer):
             logger.info(
                 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
             if self.visualizer:
-                self.visualizer.add_scalars(
-                    'epoch', {'cv_loss': cv_loss,
-                              'lr': self.lr_scheduler()}, self.epoch)
+                self.visualizer.add_scalar(tag='eval/cv_loss', value=cv_loss, step=self.epoch)
+                self.visualizer.add_scalar(tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)
 
             self.save(tag=self.epoch, infos={'val_loss': cv_loss})
             self.new_epoch()
diff --git a/paddlespeech/s2t/exps/u2_kaldi/model.py b/paddlespeech/s2t/exps/u2_kaldi/model.py
index c23b4c24..a3f45d8e 100644
--- a/paddlespeech/s2t/exps/u2_kaldi/model.py
+++ b/paddlespeech/s2t/exps/u2_kaldi/model.py
@@ -131,8 +131,8 @@ class U2Trainer(Trainer):
             if dist.get_rank() == 0 and self.visualizer:
                 losses_np_v = losses_np.copy()
                 losses_np_v.update({"lr": self.lr_scheduler()})
-                self.visualizer.add_scalars("step", losses_np_v,
-                                            self.iteration - 1)
+                for key, val in losses_np_v.items():
+                    self.visualizer.add_scalar(tag="train/"+key, value=val, step=self.iteration - 1)
 
     @paddle.no_grad()
     def valid(self):
@@ -222,9 +222,9 @@ class U2Trainer(Trainer):
             logger.info(
                 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
             if self.visualizer:
-                self.visualizer.add_scalars(
-                    'epoch', {'cv_loss': cv_loss,
-                              'lr': self.lr_scheduler()}, self.epoch)
+                self.visualizer.add_scalar(tag='eval/cv_loss', value=cv_loss, step=self.epoch)
+                self.visualizer.add_scalar(tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)
+                
             self.save(tag=self.epoch, infos={'val_loss': cv_loss})
             self.new_epoch()
 
diff --git a/paddlespeech/s2t/exps/u2_st/model.py b/paddlespeech/s2t/exps/u2_st/model.py
index 034463fe..771203cf 100644
--- a/paddlespeech/s2t/exps/u2_st/model.py
+++ b/paddlespeech/s2t/exps/u2_st/model.py
@@ -138,8 +138,8 @@ class U2STTrainer(Trainer):
             if dist.get_rank() == 0 and self.visualizer:
                 losses_np_v = losses_np.copy()
                 losses_np_v.update({"lr": self.lr_scheduler()})
-                self.visualizer.add_scalars("step", losses_np_v,
-                                            self.iteration - 1)
+                for key, val in losses_np_v.items():
+                    self.visualizer.add_scalar(tag="train/"+key, value=val, step=self.iteration - 1)
 
     @paddle.no_grad()
     def valid(self):
@@ -235,9 +235,9 @@ class U2STTrainer(Trainer):
             logger.info(
                 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
             if self.visualizer:
-                self.visualizer.add_scalars(
-                    'epoch', {'cv_loss': cv_loss,
-                              'lr': self.lr_scheduler()}, self.epoch)
+                self.visualizer.add_scalar(tag='eval/cv_loss', value=cv_loss, step=self.epoch)
+                self.visualizer.add_scalar(tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)
+                
             self.save(tag=self.epoch, infos={'val_loss': cv_loss})
             self.new_epoch()
 
diff --git a/paddlespeech/s2t/frontend/normalizer.py b/paddlespeech/s2t/frontend/normalizer.py
index a29cddc3..c55ec9a3 100644
--- a/paddlespeech/s2t/frontend/normalizer.py
+++ b/paddlespeech/s2t/frontend/normalizer.py
@@ -16,19 +16,36 @@ import json
 
 import numpy as np
 import paddle
+import jsonlines
 from paddle.io import DataLoader
 from paddle.io import Dataset
 
 from paddlespeech.s2t.frontend.audio import AudioSegment
 from paddlespeech.s2t.frontend.utility import load_cmvn
-from paddlespeech.s2t.frontend.utility import read_manifest
 from paddlespeech.s2t.utils.log import Log
 
 __all__ = ["FeatureNormalizer"]
 
 logger = Log(__name__).getlog()
 
-
+def read_manifest(manifest_path):
+     """Load and parse manifest file.
+ 
+     Args:
+         manifest_path ([type]): Manifest file to load and parse.
+     Raises:
+         IOError: If failed to parse the manifest.
+ 
+     Returns:
+         List[dict]: Manifest parsing results.
+     """
+ 
+     manifest = []
+     with jsonlines.open(manifest_path, 'r') as reader:
+         for json_data in reader:
+            manifest.append(json_data)
+     return manifest
+ 
 # https://github.com/PaddlePaddle/Paddle/pull/31481
 class CollateFunc(object):
     def __init__(self, feature_func):
@@ -61,7 +78,11 @@ class CollateFunc(object):
 class AudioDataset(Dataset):
     def __init__(self, manifest_path, num_samples=-1, rng=None, random_seed=0):
         self._rng = rng if rng else np.random.RandomState(random_seed)
-        manifest = read_manifest(manifest_path)
+        manifest = []
+        with jsonlines.open(manifest_path, 'r') as reader:
+         for json_data in reader:
+            manifest.append(json_data)
+        
         if num_samples == -1:
             sampled_manifest = manifest
         else:
diff --git a/paddlespeech/s2t/frontend/utility.py b/paddlespeech/s2t/frontend/utility.py
index d423a604..948aba06 100644
--- a/paddlespeech/s2t/frontend/utility.py
+++ b/paddlespeech/s2t/frontend/utility.py
@@ -65,7 +65,26 @@ def load_dict(dict_path: Optional[Text], maskctc=False) -> Optional[List[Text]]:
     return char_list
 
 
-def read_manifest(
+def read_manifest(manifest_path,):
+    """Load and parse manifest file.
+
+    Args:
+        manifest_path ([type]): Manifest file to load and parse.
+
+    Raises:
+        IOError: If failed to parse the manifest.
+
+    Returns:
+        List[dict]: Manifest parsing results.
+    """
+    manifest = []
+    with jsonlines.open(manifest_path, 'r') as reader:
+        for json_data in reader:
+            manifest.append(json_data)
+    return manifest
+
+    
+def read_manifest_filter(
         manifest_path,
         max_input_len=float('inf'),
         min_input_len=0.0,
@@ -98,7 +117,6 @@ def read_manifest(
     Returns:
         List[dict]: Manifest parsing results.
     """
-
     manifest = []
     with jsonlines.open(manifest_path, 'r') as reader:
         for json_data in reader:
diff --git a/paddlespeech/s2t/io/dataset.py b/paddlespeech/s2t/io/dataset.py
index 61eeb00f..006cfe04 100644
--- a/paddlespeech/s2t/io/dataset.py
+++ b/paddlespeech/s2t/io/dataset.py
@@ -95,7 +95,7 @@ class ManifestDataset(Dataset):
         super().__init__()
 
         # read manifest
-        self._manifest = read_manifest(
+        self._manifest = read_manifest_filter(
             manifest_path=manifest_path,
             max_input_len=max_input_len,
             min_input_len=min_input_len,
diff --git a/paddlespeech/s2t/training/trainer.py b/paddlespeech/s2t/training/trainer.py
index f5fb2db0..be398814 100644
--- a/paddlespeech/s2t/training/trainer.py
+++ b/paddlespeech/s2t/training/trainer.py
@@ -19,7 +19,7 @@ from pathlib import Path
 
 import paddle
 from paddle import distributed as dist
-from tensorboardX import SummaryWriter
+from visualdl import LogWriter
 
 from paddlespeech.s2t.training.reporter import ObsScope
 from paddlespeech.s2t.training.reporter import report
@@ -309,9 +309,8 @@ class Trainer():
             logger.info(
                 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
             if self.visualizer:
-                self.visualizer.add_scalars(
-                    'epoch', {'cv_loss': cv_loss,
-                              'lr': self.lr_scheduler()}, self.epoch)
+                self.visualizer.add_scalar(tag='eval/cv_loss', value=cv_loss, step=self.epoch)
+                self.visualizer.add_scalar(tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)
 
             # after epoch
             self.save(tag=self.epoch, infos={'val_loss': cv_loss})
@@ -427,7 +426,7 @@ class Trainer():
         unexpected behaviors.
         """
         # visualizer
-        visualizer = SummaryWriter(logdir=str(self.visual_dir))
+        visualizer = LogWriter(logdir=str(self.visual_dir))
         self.visualizer = visualizer
 
     @mp_tools.rank_zero_only
diff --git a/paddlespeech/text/speechtask/punctuation_restoration/training/trainer.py b/paddlespeech/text/speechtask/punctuation_restoration/training/trainer.py
index d6b6eeb6..ba7ddde3 100644
--- a/paddlespeech/text/speechtask/punctuation_restoration/training/trainer.py
+++ b/paddlespeech/text/speechtask/punctuation_restoration/training/trainer.py
@@ -34,7 +34,7 @@ from speechtask.punctuation_restoration.model.lstm import RnnLm
 from speechtask.punctuation_restoration.utils import layer_tools
 from speechtask.punctuation_restoration.utils import mp_tools
 from speechtask.punctuation_restoration.utils.checkpoint import Checkpoint
-from tensorboardX import SummaryWriter
+from visualdl import LogWriter
 
 __all__ = ["Trainer", "Tester"]
 
@@ -252,10 +252,8 @@ class Trainer():
             self.logger.info("Epoch {} Val info val_loss {}, F1_score {}".
                              format(self.epoch, total_loss, F1_score))
             if self.visualizer:
-                self.visualizer.add_scalars("epoch", {
-                    "total_loss": total_loss,
-                    "lr": self.lr_scheduler()
-                }, self.epoch)
+                self.visualizer.add_scalar(tag='eval/cv_loss', value=cv_loss, step=self.epoch)
+                self.visualizer.add_scalar(tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)
 
             self.save(
                 tag=self.epoch, infos={"val_loss": total_loss,
@@ -341,7 +339,7 @@ class Trainer():
         unexpected behaviors.
         """
         # visualizer
-        visualizer = SummaryWriter(logdir=str(self.output_dir))
+        visualizer = LogWriter(logdir=str(self.output_dir))
         self.visualizer = visualizer
 
     @mp_tools.rank_zero_only
diff --git a/requirements.txt b/requirements.txt
index 99e485f8..2ee60d3f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -40,7 +40,6 @@ snakeviz
 soundfile~=0.10
 sox
 soxbindings
-tensorboardX
 textgrid
 timer
 tqdm
diff --git a/utils/build_vocab.py b/utils/build_vocab.py
index 6a903147..61dc5e25 100755
--- a/utils/build_vocab.py
+++ b/utils/build_vocab.py
@@ -19,11 +19,11 @@ import argparse
 import functools
 import os
 import tempfile
+import jsonlines
 from collections import Counter
 
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.frontend.utility import BLANK
-from paddlespeech.s2t.frontend.utility import read_manifest
 from paddlespeech.s2t.frontend.utility import SOS
 from paddlespeech.s2t.frontend.utility import SPACE
 from paddlespeech.s2t.frontend.utility import UNK
@@ -59,13 +59,21 @@ args = parser.parse_args()
 
 
 def count_manifest(counter, text_feature, manifest_path):
-    manifest_jsons = read_manifest(manifest_path)
+    manifest_jsons = []
+    with jsonlines.open(manifest_path, 'r') as reader:
+        for json_data in reader:
+            manifest_jsons.append(json_data)
+        
     for line_json in manifest_jsons:
         line = text_feature.tokenize(line_json['text'], replace_space=False)
         counter.update(line)
 
 def dump_text_manifest(fileobj, manifest_path, key='text'):
-    manifest_jsons = read_manifest(manifest_path)
+    manifest_jsons = []
+    with jsonlines.open(manifest_path, 'r') as reader:
+        for json_data in reader:
+            manifest_jsons.append(json_data)
+            
     for line_json in manifest_jsons:
         fileobj.write(line_json[key] + "\n")
 
diff --git a/utils/utility.py b/utils/utility.py
index b4db518a..29fda268 100755
--- a/utils/utility.py
+++ b/utils/utility.py
@@ -42,6 +42,7 @@ def read_manifest(manifest_path):
     for json_line in open(manifest_path, 'r'):
         try:
             json_data = json.loads(json_line)
+            manifest.append(json_data)
         except Exception as e:
             raise IOError("Error reading manifest: %s" % str(e))
     return manifest

From d395c2b8e34cede258b8d070271d2e8aa983ded5 Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Tue, 30 Nov 2021 08:10:51 +0000
Subject: [PATCH 2/3] jsonlines read manifest file

---
 .../frontend/augmentor/impulse_response.py    |  5 ++--
 .../s2t/frontend/augmentor/noise_perturb.py   |  5 ++--
 paddlespeech/s2t/frontend/normalizer.py       | 26 +++----------------
 paddlespeech/s2t/frontend/utility.py          | 21 +--------------
 paddlespeech/s2t/io/dataloader.py             |  6 +++--
 paddlespeech/s2t/io/dataset.py                |  7 ++---
 paddlespeech/s2t/utils/socket_server.py       |  6 ++---
 utils/dump_manifest.py                        |  8 +++---
 utils/format_data.py                          |  6 +++--
 utils/format_triplet_data.py                  |  5 ++--
 utils/manifest_key_value.py                   |  5 ++--
 utils/utility.py                              | 24 +----------------
 12 files changed, 37 insertions(+), 87 deletions(-)

diff --git a/paddlespeech/s2t/frontend/augmentor/impulse_response.py b/paddlespeech/s2t/frontend/augmentor/impulse_response.py
index 6cc9c0d4..1a82bb92 100644
--- a/paddlespeech/s2t/frontend/augmentor/impulse_response.py
+++ b/paddlespeech/s2t/frontend/augmentor/impulse_response.py
@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Contains the impulse response augmentation model."""
+import jsonlines
 from paddlespeech.s2t.frontend.audio import AudioSegment
 from paddlespeech.s2t.frontend.augmentor.base import AugmentorBase
-from paddlespeech.s2t.frontend.utility import read_manifest
 
 
 class ImpulseResponseAugmentor(AugmentorBase):
@@ -28,7 +28,8 @@ class ImpulseResponseAugmentor(AugmentorBase):
 
     def __init__(self, rng, impulse_manifest_path):
         self._rng = rng
-        self._impulse_manifest = read_manifest(impulse_manifest_path)
+        with jsonlines.open(impulse_manifest_path, 'r') as reader:
+            self._impulse_manifest = list(reader)
 
     def __call__(self, x, uttid=None, train=True):
         if not train:
diff --git a/paddlespeech/s2t/frontend/augmentor/noise_perturb.py b/paddlespeech/s2t/frontend/augmentor/noise_perturb.py
index 9d6da1a8..ce0a8818 100644
--- a/paddlespeech/s2t/frontend/augmentor/noise_perturb.py
+++ b/paddlespeech/s2t/frontend/augmentor/noise_perturb.py
@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Contains the noise perturb augmentation model."""
+import jsonlines
 from paddlespeech.s2t.frontend.audio import AudioSegment
 from paddlespeech.s2t.frontend.augmentor.base import AugmentorBase
-from paddlespeech.s2t.frontend.utility import read_manifest
 
 
 class NoisePerturbAugmentor(AugmentorBase):
@@ -34,7 +34,8 @@ class NoisePerturbAugmentor(AugmentorBase):
         self._min_snr_dB = min_snr_dB
         self._max_snr_dB = max_snr_dB
         self._rng = rng
-        self._noise_manifest = read_manifest(manifest_path=noise_manifest_path)
+        with jsonlines.open(noise_manifest_path, 'r') as reader:
+            self._noise_manifest = list(reader)
 
     def __call__(self, x, uttid=None, train=True):
         if not train:
diff --git a/paddlespeech/s2t/frontend/normalizer.py b/paddlespeech/s2t/frontend/normalizer.py
index c55ec9a3..0a634fc1 100644
--- a/paddlespeech/s2t/frontend/normalizer.py
+++ b/paddlespeech/s2t/frontend/normalizer.py
@@ -13,10 +13,9 @@
 # limitations under the License.
 """Contains feature normalizers."""
 import json
-
+import jsonlines
 import numpy as np
 import paddle
-import jsonlines
 from paddle.io import DataLoader
 from paddle.io import Dataset
 
@@ -27,24 +26,6 @@ from paddlespeech.s2t.utils.log import Log
 __all__ = ["FeatureNormalizer"]
 
 logger = Log(__name__).getlog()
-
-def read_manifest(manifest_path):
-     """Load and parse manifest file.
- 
-     Args:
-         manifest_path ([type]): Manifest file to load and parse.
-     Raises:
-         IOError: If failed to parse the manifest.
- 
-     Returns:
-         List[dict]: Manifest parsing results.
-     """
- 
-     manifest = []
-     with jsonlines.open(manifest_path, 'r') as reader:
-         for json_data in reader:
-            manifest.append(json_data)
-     return manifest
  
 # https://github.com/PaddlePaddle/Paddle/pull/31481
 class CollateFunc(object):
@@ -78,10 +59,9 @@ class CollateFunc(object):
 class AudioDataset(Dataset):
     def __init__(self, manifest_path, num_samples=-1, rng=None, random_seed=0):
         self._rng = rng if rng else np.random.RandomState(random_seed)
-        manifest = []
+
         with jsonlines.open(manifest_path, 'r') as reader:
-         for json_data in reader:
-            manifest.append(json_data)
+            manifest = list(reader)
         
         if num_samples == -1:
             sampled_manifest = manifest
diff --git a/paddlespeech/s2t/frontend/utility.py b/paddlespeech/s2t/frontend/utility.py
index 948aba06..ccb767ad 100644
--- a/paddlespeech/s2t/frontend/utility.py
+++ b/paddlespeech/s2t/frontend/utility.py
@@ -64,27 +64,8 @@ def load_dict(dict_path: Optional[Text], maskctc=False) -> Optional[List[Text]]:
         char_list.append(MASKCTC)
     return char_list
 
-
-def read_manifest(manifest_path,):
-    """Load and parse manifest file.
-
-    Args:
-        manifest_path ([type]): Manifest file to load and parse.
-
-    Raises:
-        IOError: If failed to parse the manifest.
-
-    Returns:
-        List[dict]: Manifest parsing results.
-    """
-    manifest = []
-    with jsonlines.open(manifest_path, 'r') as reader:
-        for json_data in reader:
-            manifest.append(json_data)
-    return manifest
-
     
-def read_manifest_filter(
+def read_manifest(
         manifest_path,
         max_input_len=float('inf'),
         min_input_len=0.0,
diff --git a/paddlespeech/s2t/io/dataloader.py b/paddlespeech/s2t/io/dataloader.py
index 3b5000a2..bda48842 100644
--- a/paddlespeech/s2t/io/dataloader.py
+++ b/paddlespeech/s2t/io/dataloader.py
@@ -15,11 +15,11 @@ from typing import Any
 from typing import Dict
 from typing import List
 from typing import Text
+import jsonlines
 
 import numpy as np
 from paddle.io import DataLoader
 
-from paddlespeech.s2t.frontend.utility import read_manifest
 from paddlespeech.s2t.io.batchfy import make_batchset
 from paddlespeech.s2t.io.converter import CustomConverter
 from paddlespeech.s2t.io.dataset import TransformDataset
@@ -91,7 +91,9 @@ class BatchDataLoader():
         self.n_iter_processes = n_iter_processes
 
         # read json data
-        self.data_json = read_manifest(json_file)
+        with jsonlines.open(json_file, 'r') as reader:
+            self.data_json = list(reader)
+            
         self.feat_dim, self.vocab_size = feat_dim_and_vocab_size(
             self.data_json, mode='asr')
 
diff --git a/paddlespeech/s2t/io/dataset.py b/paddlespeech/s2t/io/dataset.py
index 006cfe04..ba10aebb 100644
--- a/paddlespeech/s2t/io/dataset.py
+++ b/paddlespeech/s2t/io/dataset.py
@@ -14,7 +14,7 @@
 # Modified from espnet(https://github.com/espnet/espnet)
 # Modified from wenet(https://github.com/wenet-e2e/wenet)
 from typing import Optional
-
+import jsonlines
 from paddle.io import Dataset
 from yacs.config import CfgNode
 
@@ -95,7 +95,7 @@ class ManifestDataset(Dataset):
         super().__init__()
 
         # read manifest
-        self._manifest = read_manifest_filter(
+        self._manifest = read_manifest(
             manifest_path=manifest_path,
             max_input_len=max_input_len,
             min_input_len=min_input_len,
@@ -184,7 +184,8 @@ class AudioDataset(Dataset):
         """
         assert batch_type in ['static', 'dynamic']
         # read manifest
-        data = read_manifest(data_file)
+        with jsonlines.open(data_file, 'r') as reader:
+            data = list(reader)
         if sort:
             data = sorted(data, key=lambda x: x["feat_shape"][0])
         if raw_wav:
diff --git a/paddlespeech/s2t/utils/socket_server.py b/paddlespeech/s2t/utils/socket_server.py
index 43b56d72..6371ba85 100644
--- a/paddlespeech/s2t/utils/socket_server.py
+++ b/paddlespeech/s2t/utils/socket_server.py
@@ -20,8 +20,7 @@ import time
 import wave
 from time import gmtime
 from time import strftime
-
-from paddlespeech.s2t.frontend.utility import read_manifest
+import jsonlines
 
 __all__ = ["socket_send", "warm_up_test", "AsrTCPServer", "AsrRequestHandler"]
 
@@ -44,7 +43,8 @@ def warm_up_test(audio_process_handler,
                  num_test_cases,
                  random_seed=0):
     """Warming-up test."""
-    manifest = read_manifest(manifest_path)
+    with jsonlines.open(manifest_path) as reader:
+        manifest = list(reader)
     rng = random.Random(random_seed)
     samples = rng.sample(manifest, num_test_cases)
     for idx, sample in enumerate(samples):
diff --git a/utils/dump_manifest.py b/utils/dump_manifest.py
index b5f7b64a..d602571d 100755
--- a/utils/dump_manifest.py
+++ b/utils/dump_manifest.py
@@ -16,8 +16,7 @@
 import argparse
 from pathlib import Path
 from typing import Union
-
-from paddlespeech.s2t.frontend.utility import read_manifest
+import jsonlines
 
 key_whitelist = set(['feat', 'text', 'syllable', 'phone'])
 filename = {
@@ -32,7 +31,10 @@ def dump_manifest(manifest_path, output_dir: Union[str, Path]):
 
     output_dir = Path(output_dir).expanduser()
     manifest_path = Path(manifest_path).expanduser()
-    manifest_jsons = read_manifest(manifest_path)
+
+    with jsonlines.open(str(manifest_path), 'r') as reader:
+        manifest_jsons = list(reader)
+        
     first_line = manifest_jsons[0]
     file_map = {}
 
diff --git a/utils/format_data.py b/utils/format_data.py
index 2fa1924a..437d7e0f 100755
--- a/utils/format_data.py
+++ b/utils/format_data.py
@@ -15,11 +15,11 @@
 """format manifest with more metadata."""
 import argparse
 import functools
+import jsonlines
 import json
 
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.frontend.utility import load_cmvn
-from paddlespeech.s2t.frontend.utility import read_manifest
 from paddlespeech.s2t.io.utility import feat_type
 from paddlespeech.s2t.utils.utility import add_arguments
 from paddlespeech.s2t.utils.utility import print_arguments
@@ -71,7 +71,9 @@ def main():
     # }
     count = 0
     for manifest_path in args.manifest_paths:
-        manifest_jsons = read_manifest(manifest_path)
+        with jsonlines.open(str(manifest_path), 'r') as reader:
+            manifest_jsons = list(reader)
+        
         for line_json in manifest_jsons:
             output_json = {
                 "input": [],
diff --git a/utils/format_triplet_data.py b/utils/format_triplet_data.py
index e0b5ece3..dd9dab42 100755
--- a/utils/format_triplet_data.py
+++ b/utils/format_triplet_data.py
@@ -16,10 +16,10 @@
 import argparse
 import functools
 import json
+import jsonlines
 
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.frontend.utility import load_cmvn
-from paddlespeech.s2t.frontend.utility import read_manifest
 from paddlespeech.s2t.io.utility import feat_type
 from paddlespeech.s2t.utils.utility import add_arguments
 from paddlespeech.s2t.utils.utility import print_arguments
@@ -63,7 +63,8 @@ def main():
 
     count = 0
     for manifest_path in args.manifest_paths:
-        manifest_jsons = read_manifest(manifest_path)
+        with jsonlines.open(str(manifest_path), 'r') as reader:
+            manifest_jsons = list(reader)
         for line_json in manifest_jsons:
             # text: translation text, text1: transcript text.
             # Currently only support joint-vocab, will add separate vocabs setting.
diff --git a/utils/manifest_key_value.py b/utils/manifest_key_value.py
index b409236f..0cfb2450 100755
--- a/utils/manifest_key_value.py
+++ b/utils/manifest_key_value.py
@@ -3,10 +3,10 @@
 import argparse
 import functools
 from pathlib import Path
+import jsonlines
 
 from utils.utility import add_arguments
 from utils.utility import print_arguments
-from utils.utility import read_manifest
 
 
 def main(args):
@@ -19,7 +19,8 @@ def main(args):
     dur_scp = outdir / 'duration'
     text_scp = outdir / 'text'
 
-    manifest_jsons = read_manifest(args.manifest_path)
+    with jsonlines.open(args.manifest_path, 'r') as reader:
+        manifest_jsons = list(reader)
 
     with wav_scp.open('w') as fwav, dur_scp.open('w') as fdur, text_scp.open(
             'w') as ftxt:
diff --git a/utils/utility.py b/utils/utility.py
index 29fda268..b3523b38 100755
--- a/utils/utility.py
+++ b/utils/utility.py
@@ -22,32 +22,10 @@ from typing import Text
 __all__ = [
     "check_md5sum", "getfile_insensitive", "download_multi", "download",
     "unpack", "unzip", "md5file", "print_arguments", "add_arguments",
-    "read_manifest", "get_commandline_args"
+    "get_commandline_args"
 ]
 
 
-def read_manifest(manifest_path):
-    """Load and parse manifest file.
-    Args:
-        manifest_path ([type]): Manifest file to load and parse.
-
-    Raises:
-        IOError: If failed to parse the manifest.
-
-    Returns:
-        List[dict]: Manifest parsing results.
-    """
-
-    manifest = []
-    for json_line in open(manifest_path, 'r'):
-        try:
-            json_data = json.loads(json_line)
-            manifest.append(json_data)
-        except Exception as e:
-            raise IOError("Error reading manifest: %s" % str(e))
-    return manifest
-
-
 def get_commandline_args():
     extra_chars = [
         " ",

From 39228864bb1b4995de464d57b641ab43a247d9c7 Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Tue, 30 Nov 2021 08:18:13 +0000
Subject: [PATCH 3/3] format code

---
 examples/aishell/asr1/READEME.md                      |  3 ---
 paddlespeech/s2t/exps/u2/model.py                     | 10 ++++++----
 paddlespeech/s2t/exps/u2_kaldi/model.py               | 11 +++++++----
 paddlespeech/s2t/exps/u2_st/model.py                  | 11 +++++++----
 .../s2t/frontend/augmentor/impulse_response.py        |  1 +
 paddlespeech/s2t/frontend/augmentor/noise_perturb.py  |  1 +
 paddlespeech/s2t/frontend/normalizer.py               |  6 ++++--
 paddlespeech/s2t/frontend/utility.py                  |  2 +-
 paddlespeech/s2t/io/dataloader.py                     |  4 ++--
 paddlespeech/s2t/io/dataset.py                        |  1 +
 paddlespeech/s2t/io/sampler.py                        |  2 +-
 paddlespeech/s2t/training/trainer.py                  |  6 ++++--
 paddlespeech/s2t/utils/socket_server.py               |  1 +
 .../punctuation_restoration/training/trainer.py       |  6 ++++--
 utils/build_vocab.py                                  |  7 ++++---
 utils/dump_manifest.py                                |  3 ++-
 utils/format_data.py                                  |  5 +++--
 utils/format_triplet_data.py                          |  1 +
 utils/manifest_key_value.py                           |  1 +
 utils/utility.py                                      |  1 -
 20 files changed, 51 insertions(+), 32 deletions(-)

diff --git a/examples/aishell/asr1/READEME.md b/examples/aishell/asr1/READEME.md
index e9fd3017..2eea233d 100644
--- a/examples/aishell/asr1/READEME.md
+++ b/examples/aishell/asr1/READEME.md
@@ -339,6 +339,3 @@ You need to prepare an audio file, please confirm the sample rate of the audio i
 ```bash
 CUDA_VISIBLE_DEVICES= ./local/test_hub.sh conf/transformer.yaml exp/transformer/checkpoints/avg_20 data/test_audio.wav
 ```
-
-
-
diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py
index 5dbb72f4..d448021c 100644
--- a/paddlespeech/s2t/exps/u2/model.py
+++ b/paddlespeech/s2t/exps/u2/model.py
@@ -129,8 +129,8 @@ class U2Trainer(Trainer):
                 losses_np_v = losses_np.copy()
                 losses_np_v.update({"lr": self.lr_scheduler()})
                 for key, val in losses_np_v.items():
-                    self.visualizer.add_scalar(tag='train/'+key, value=val, step=self.iteration-1)
-
+                    self.visualizer.add_scalar(
+                        tag='train/' + key, value=val, step=self.iteration - 1)
 
     @paddle.no_grad()
     def valid(self):
@@ -238,8 +238,10 @@ class U2Trainer(Trainer):
             logger.info(
                 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
             if self.visualizer:
-                self.visualizer.add_scalar(tag='eval/cv_loss', value=cv_loss, step=self.epoch)
-                self.visualizer.add_scalar(tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)
+                self.visualizer.add_scalar(
+                    tag='eval/cv_loss', value=cv_loss, step=self.epoch)
+                self.visualizer.add_scalar(
+                    tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)
 
             self.save(tag=self.epoch, infos={'val_loss': cv_loss})
             self.new_epoch()
diff --git a/paddlespeech/s2t/exps/u2_kaldi/model.py b/paddlespeech/s2t/exps/u2_kaldi/model.py
index a3f45d8e..43e31a60 100644
--- a/paddlespeech/s2t/exps/u2_kaldi/model.py
+++ b/paddlespeech/s2t/exps/u2_kaldi/model.py
@@ -132,7 +132,8 @@ class U2Trainer(Trainer):
                 losses_np_v = losses_np.copy()
                 losses_np_v.update({"lr": self.lr_scheduler()})
                 for key, val in losses_np_v.items():
-                    self.visualizer.add_scalar(tag="train/"+key, value=val, step=self.iteration - 1)
+                    self.visualizer.add_scalar(
+                        tag="train/" + key, value=val, step=self.iteration - 1)
 
     @paddle.no_grad()
     def valid(self):
@@ -222,9 +223,11 @@ class U2Trainer(Trainer):
             logger.info(
                 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
             if self.visualizer:
-                self.visualizer.add_scalar(tag='eval/cv_loss', value=cv_loss, step=self.epoch)
-                self.visualizer.add_scalar(tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)
-                
+                self.visualizer.add_scalar(
+                    tag='eval/cv_loss', value=cv_loss, step=self.epoch)
+                self.visualizer.add_scalar(
+                    tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)
+
             self.save(tag=self.epoch, infos={'val_loss': cv_loss})
             self.new_epoch()
 
diff --git a/paddlespeech/s2t/exps/u2_st/model.py b/paddlespeech/s2t/exps/u2_st/model.py
index 771203cf..2dbbdcd3 100644
--- a/paddlespeech/s2t/exps/u2_st/model.py
+++ b/paddlespeech/s2t/exps/u2_st/model.py
@@ -139,7 +139,8 @@ class U2STTrainer(Trainer):
                 losses_np_v = losses_np.copy()
                 losses_np_v.update({"lr": self.lr_scheduler()})
                 for key, val in losses_np_v.items():
-                    self.visualizer.add_scalar(tag="train/"+key, value=val, step=self.iteration - 1)
+                    self.visualizer.add_scalar(
+                        tag="train/" + key, value=val, step=self.iteration - 1)
 
     @paddle.no_grad()
     def valid(self):
@@ -235,9 +236,11 @@ class U2STTrainer(Trainer):
             logger.info(
                 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
             if self.visualizer:
-                self.visualizer.add_scalar(tag='eval/cv_loss', value=cv_loss, step=self.epoch)
-                self.visualizer.add_scalar(tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)
-                
+                self.visualizer.add_scalar(
+                    tag='eval/cv_loss', value=cv_loss, step=self.epoch)
+                self.visualizer.add_scalar(
+                    tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)
+
             self.save(tag=self.epoch, infos={'val_loss': cv_loss})
             self.new_epoch()
 
diff --git a/paddlespeech/s2t/frontend/augmentor/impulse_response.py b/paddlespeech/s2t/frontend/augmentor/impulse_response.py
index 1a82bb92..5ba45bb2 100644
--- a/paddlespeech/s2t/frontend/augmentor/impulse_response.py
+++ b/paddlespeech/s2t/frontend/augmentor/impulse_response.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 """Contains the impulse response augmentation model."""
 import jsonlines
+
 from paddlespeech.s2t.frontend.audio import AudioSegment
 from paddlespeech.s2t.frontend.augmentor.base import AugmentorBase
 
diff --git a/paddlespeech/s2t/frontend/augmentor/noise_perturb.py b/paddlespeech/s2t/frontend/augmentor/noise_perturb.py
index ce0a8818..71165dac 100644
--- a/paddlespeech/s2t/frontend/augmentor/noise_perturb.py
+++ b/paddlespeech/s2t/frontend/augmentor/noise_perturb.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 """Contains the noise perturb augmentation model."""
 import jsonlines
+
 from paddlespeech.s2t.frontend.audio import AudioSegment
 from paddlespeech.s2t.frontend.augmentor.base import AugmentorBase
 
diff --git a/paddlespeech/s2t/frontend/normalizer.py b/paddlespeech/s2t/frontend/normalizer.py
index 0a634fc1..017851e6 100644
--- a/paddlespeech/s2t/frontend/normalizer.py
+++ b/paddlespeech/s2t/frontend/normalizer.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 """Contains feature normalizers."""
 import json
+
 import jsonlines
 import numpy as np
 import paddle
@@ -26,7 +27,8 @@ from paddlespeech.s2t.utils.log import Log
 __all__ = ["FeatureNormalizer"]
 
 logger = Log(__name__).getlog()
- 
+
+
 # https://github.com/PaddlePaddle/Paddle/pull/31481
 class CollateFunc(object):
     def __init__(self, feature_func):
@@ -62,7 +64,7 @@ class AudioDataset(Dataset):
 
         with jsonlines.open(manifest_path, 'r') as reader:
             manifest = list(reader)
-        
+
         if num_samples == -1:
             sampled_manifest = manifest
         else:
diff --git a/paddlespeech/s2t/frontend/utility.py b/paddlespeech/s2t/frontend/utility.py
index ccb767ad..175727e1 100644
--- a/paddlespeech/s2t/frontend/utility.py
+++ b/paddlespeech/s2t/frontend/utility.py
@@ -64,7 +64,7 @@ def load_dict(dict_path: Optional[Text], maskctc=False) -> Optional[List[Text]]:
         char_list.append(MASKCTC)
     return char_list
 
-    
+
 def read_manifest(
         manifest_path,
         max_input_len=float('inf'),
diff --git a/paddlespeech/s2t/io/dataloader.py b/paddlespeech/s2t/io/dataloader.py
index bda48842..b8eb3367 100644
--- a/paddlespeech/s2t/io/dataloader.py
+++ b/paddlespeech/s2t/io/dataloader.py
@@ -15,8 +15,8 @@ from typing import Any
 from typing import Dict
 from typing import List
 from typing import Text
-import jsonlines
 
+import jsonlines
 import numpy as np
 from paddle.io import DataLoader
 
@@ -93,7 +93,7 @@ class BatchDataLoader():
         # read json data
         with jsonlines.open(json_file, 'r') as reader:
             self.data_json = list(reader)
-            
+
         self.feat_dim, self.vocab_size = feat_dim_and_vocab_size(
             self.data_json, mode='asr')
 
diff --git a/paddlespeech/s2t/io/dataset.py b/paddlespeech/s2t/io/dataset.py
index ba10aebb..d64d7d3e 100644
--- a/paddlespeech/s2t/io/dataset.py
+++ b/paddlespeech/s2t/io/dataset.py
@@ -14,6 +14,7 @@
 # Modified from espnet(https://github.com/espnet/espnet)
 # Modified from wenet(https://github.com/wenet-e2e/wenet)
 from typing import Optional
+
 import jsonlines
 from paddle.io import Dataset
 from yacs.config import CfgNode
diff --git a/paddlespeech/s2t/io/sampler.py b/paddlespeech/s2t/io/sampler.py
index 0d5a16ce..35b57524 100644
--- a/paddlespeech/s2t/io/sampler.py
+++ b/paddlespeech/s2t/io/sampler.py
@@ -51,7 +51,7 @@ def _batch_shuffle(indices, batch_size, epoch, clipped=False):
     """
     rng = np.random.RandomState(epoch)
     shift_len = rng.randint(0, batch_size - 1)
-    batch_indices = list(zip(*[iter(indices[shift_len:])] * batch_size))
+    batch_indices = list(zip(* [iter(indices[shift_len:])] * batch_size))
     rng.shuffle(batch_indices)
     batch_indices = [item for batch in batch_indices for item in batch]
     assert clipped is False
diff --git a/paddlespeech/s2t/training/trainer.py b/paddlespeech/s2t/training/trainer.py
index be398814..f0099f10 100644
--- a/paddlespeech/s2t/training/trainer.py
+++ b/paddlespeech/s2t/training/trainer.py
@@ -309,8 +309,10 @@ class Trainer():
             logger.info(
                 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
             if self.visualizer:
-                self.visualizer.add_scalar(tag='eval/cv_loss', value=cv_loss, step=self.epoch)
-                self.visualizer.add_scalar(tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)
+                self.visualizer.add_scalar(
+                    tag='eval/cv_loss', value=cv_loss, step=self.epoch)
+                self.visualizer.add_scalar(
+                    tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)
 
             # after epoch
             self.save(tag=self.epoch, infos={'val_loss': cv_loss})
diff --git a/paddlespeech/s2t/utils/socket_server.py b/paddlespeech/s2t/utils/socket_server.py
index 6371ba85..691ea966 100644
--- a/paddlespeech/s2t/utils/socket_server.py
+++ b/paddlespeech/s2t/utils/socket_server.py
@@ -20,6 +20,7 @@ import time
 import wave
 from time import gmtime
 from time import strftime
+
 import jsonlines
 
 __all__ = ["socket_send", "warm_up_test", "AsrTCPServer", "AsrRequestHandler"]
diff --git a/paddlespeech/text/speechtask/punctuation_restoration/training/trainer.py b/paddlespeech/text/speechtask/punctuation_restoration/training/trainer.py
index ba7ddde3..78512796 100644
--- a/paddlespeech/text/speechtask/punctuation_restoration/training/trainer.py
+++ b/paddlespeech/text/speechtask/punctuation_restoration/training/trainer.py
@@ -252,8 +252,10 @@ class Trainer():
             self.logger.info("Epoch {} Val info val_loss {}, F1_score {}".
                              format(self.epoch, total_loss, F1_score))
             if self.visualizer:
-                self.visualizer.add_scalar(tag='eval/cv_loss', value=cv_loss, step=self.epoch)
-                self.visualizer.add_scalar(tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)
+                self.visualizer.add_scalar(
+                    tag='eval/cv_loss', value=cv_loss, step=self.epoch)
+                self.visualizer.add_scalar(
+                    tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)
 
             self.save(
                 tag=self.epoch, infos={"val_loss": total_loss,
diff --git a/utils/build_vocab.py b/utils/build_vocab.py
index 61dc5e25..f832cbbc 100755
--- a/utils/build_vocab.py
+++ b/utils/build_vocab.py
@@ -19,9 +19,10 @@ import argparse
 import functools
 import os
 import tempfile
-import jsonlines
 from collections import Counter
 
+import jsonlines
+
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.frontend.utility import BLANK
 from paddlespeech.s2t.frontend.utility import SOS
@@ -63,7 +64,7 @@ def count_manifest(counter, text_feature, manifest_path):
     with jsonlines.open(manifest_path, 'r') as reader:
         for json_data in reader:
             manifest_jsons.append(json_data)
-        
+
     for line_json in manifest_jsons:
         line = text_feature.tokenize(line_json['text'], replace_space=False)
         counter.update(line)
@@ -73,7 +74,7 @@ def dump_text_manifest(fileobj, manifest_path, key='text'):
     with jsonlines.open(manifest_path, 'r') as reader:
         for json_data in reader:
             manifest_jsons.append(json_data)
-            
+
     for line_json in manifest_jsons:
         fileobj.write(line_json[key] + "\n")
 
diff --git a/utils/dump_manifest.py b/utils/dump_manifest.py
index d602571d..58d91755 100755
--- a/utils/dump_manifest.py
+++ b/utils/dump_manifest.py
@@ -16,6 +16,7 @@
 import argparse
 from pathlib import Path
 from typing import Union
+
 import jsonlines
 
 key_whitelist = set(['feat', 'text', 'syllable', 'phone'])
@@ -34,7 +35,7 @@ def dump_manifest(manifest_path, output_dir: Union[str, Path]):
 
     with jsonlines.open(str(manifest_path), 'r') as reader:
         manifest_jsons = list(reader)
-        
+
     first_line = manifest_jsons[0]
     file_map = {}
 
diff --git a/utils/format_data.py b/utils/format_data.py
index 437d7e0f..6db2a1bb 100755
--- a/utils/format_data.py
+++ b/utils/format_data.py
@@ -15,9 +15,10 @@
 """format manifest with more metadata."""
 import argparse
 import functools
-import jsonlines
 import json
 
+import jsonlines
+
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.frontend.utility import load_cmvn
 from paddlespeech.s2t.io.utility import feat_type
@@ -73,7 +74,7 @@ def main():
     for manifest_path in args.manifest_paths:
         with jsonlines.open(str(manifest_path), 'r') as reader:
             manifest_jsons = list(reader)
-        
+
         for line_json in manifest_jsons:
             output_json = {
                 "input": [],
diff --git a/utils/format_triplet_data.py b/utils/format_triplet_data.py
index dd9dab42..44ff4527 100755
--- a/utils/format_triplet_data.py
+++ b/utils/format_triplet_data.py
@@ -16,6 +16,7 @@
 import argparse
 import functools
 import json
+
 import jsonlines
 
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
diff --git a/utils/manifest_key_value.py b/utils/manifest_key_value.py
index 0cfb2450..fb3d3aaa 100755
--- a/utils/manifest_key_value.py
+++ b/utils/manifest_key_value.py
@@ -3,6 +3,7 @@
 import argparse
 import functools
 from pathlib import Path
+
 import jsonlines
 
 from utils.utility import add_arguments
diff --git a/utils/utility.py b/utils/utility.py
index b3523b38..dbf8b1d7 100755
--- a/utils/utility.py
+++ b/utils/utility.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import hashlib
-import json
 import os
 import sys
 import tarfile