From 279348d7860cc3ba45a80c86f3d2c9194972db53 Mon Sep 17 00:00:00 2001
From: Haoxin Ma <745165806@qq.com>
Date: Tue, 8 Jun 2021 10:32:05 +0000
Subject: [PATCH 01/14] move process utt to collator

---
 deepspeech/exps/deepspeech2/model.py | 2 +-
 deepspeech/io/collator.py | 117 ++++++++++++++++++++++++-
 deepspeech/io/dataset.py | 82 +----------------
 examples/tiny/s0/conf/deepspeech2.yaml | 4 +-
 4 files changed, 120 insertions(+), 85 deletions(-)

diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py
index 468bc652..50ff3c17 100644
--- a/deepspeech/exps/deepspeech2/model.py
+++ b/deepspeech/exps/deepspeech2/model.py
@@ -165,7 +165,7 @@ class DeepSpeech2Trainer(Trainer):
             sortagrad=config.data.sortagrad,
             shuffle_method=config.data.shuffle_method)
 
-        collate_fn = SpeechCollator(keep_transcription_text=False)
+        collate_fn = SpeechCollator(config, keep_transcription_text=False)
         self.train_loader = DataLoader(
             train_dataset,
             batch_sampler=batch_sampler,
diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py
index 3bec9875..d725b0b1 100644
--- a/deepspeech/io/collator.py
+++ b/deepspeech/io/collator.py
@@ -16,14 +16,22 @@ import numpy as np
 from deepspeech.frontend.utility import IGNORE_ID
 from deepspeech.io.utility import pad_sequence
 from deepspeech.utils.log import Log
+from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
+from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer
+from deepspeech.frontend.normalizer import FeatureNormalizer
+from deepspeech.frontend.speech import SpeechSegment
+import io
+import time
 
 __all__ = ["SpeechCollator"]
 
 logger = Log(__name__).getlog()
 
+# namedtuple needs to be global for pickling.
+TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object'])
 
 class SpeechCollator():
-    def __init__(self, keep_transcription_text=True):
+    def __init__(self, config, keep_transcription_text=True):
         """
         Padding audio features with zeros to make them have the same shape (or
         a user-defined shape) within one batch.
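The `TarLocalData` namedtuple above is declared at module scope on purpose: when a `DataLoader` runs with worker processes, the collator object is pickled and shipped to each worker, and pickle can only resolve classes that are importable under their module-qualified name. A minimal, standalone illustration of the constraint (the `Point` names are illustrative, not from this patch):

```python
import pickle
from collections import namedtuple

# Module-level namedtuple: pickle can locate it by qualified name.
Point = namedtuple('Point', ['x', 'y'])

def make_local_point():
    # Defined inside a function: pickle cannot look this class up later.
    LocalPoint = namedtuple('LocalPoint', ['x', 'y'])
    return LocalPoint(1, 2)

print(pickle.loads(pickle.dumps(Point(1, 2))))   # Point(x=1, y=2)
try:
    pickle.dumps(make_local_point())
except pickle.PicklingError as err:
    print('local namedtuple is not picklable:', err)
```

(Note that this first patch uses `namedtuple` before importing it; the missing `from collections import namedtuple` arrives in PATCH 02 below.)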
@@ -32,6 +40,112 @@ class SpeechCollator(): """ self._keep_transcription_text = keep_transcription_text + if isinstance(config.data.augmentation_config, (str, bytes)): + if config.data.augmentation_config: + aug_file = io.open( + config.data.augmentation_config, mode='r', encoding='utf8') + else: + aug_file = io.StringIO(initial_value='{}', newline='') + else: + aug_file = config.data.augmentation_config + assert isinstance(aug_file, io.StringIO) + + self._local_data = TarLocalData(tar2info={}, tar2object={}) + self._augmentation_pipeline = AugmentationPipeline( + augmentation_config=aug_file.read(), + random_seed=config.data.random_seed) + + self._normalizer = FeatureNormalizer( + config.data.mean_std_filepath) if config.data.mean_std_filepath else None + + self._stride_ms = config.data.stride_ms + self._target_sample_rate = config.data.target_sample_rate + + self._speech_featurizer = SpeechFeaturizer( + unit_type=config.data.unit_type, + vocab_filepath=config.data.vocab_filepath, + spm_model_prefix=config.data.spm_model_prefix, + specgram_type=config.data.specgram_type, + feat_dim=config.data.feat_dim, + delta_delta=config.data.delta_delta, + stride_ms=config.data.stride_ms, + window_ms=config.data.window_ms, + n_fft=config.data.n_fft, + max_freq=config.data.max_freq, + target_sample_rate=config.data.target_sample_rate, + use_dB_normalization=config.data.use_dB_normalization, + target_dB=config.data.target_dB, + dither=config.data.dither) + + def _parse_tar(self, file): + """Parse a tar file to get a tarfile object + and a map containing tarinfoes + """ + result = {} + f = tarfile.open(file) + for tarinfo in f.getmembers(): + result[tarinfo.name] = tarinfo + return f, result + + def _subfile_from_tar(self, file): + """Get subfile object from tar. + + It will return a subfile object from tar file + and cached tar file info for next reading request. + """ + tarpath, filename = file.split(':', 1)[1].split('#', 1) + if 'tar2info' not in self._local_data.__dict__: + self._local_data.tar2info = {} + if 'tar2object' not in self._local_data.__dict__: + self._local_data.tar2object = {} + if tarpath not in self._local_data.tar2info: + object, infoes = self._parse_tar(tarpath) + self._local_data.tar2info[tarpath] = infoes + self._local_data.tar2object[tarpath] = object + return self._local_data.tar2object[tarpath].extractfile( + self._local_data.tar2info[tarpath][filename]) + + def process_utterance(self, audio_file, transcript): + """Load, augment, featurize and normalize for speech data. + + :param audio_file: Filepath or file object of audio file. + :type audio_file: str | file + :param transcript: Transcription text. + :type transcript: str + :return: Tuple of audio feature tensor and data of transcription part, + where transcription part could be token ids or text. 
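The `_subfile_from_tar` method above resolves locators of the form `tar:<archive path>#<member name>`, opening each archive once and caching both the open handle and a name-to-`TarInfo` index, so repeated reads from the same archive skip the member scan. The same idea in a self-contained sketch (module-level dicts stand in for `TarLocalData`):

```python
import tarfile

_tar2info = {}    # archive path -> {member name: TarInfo}
_tar2object = {}  # archive path -> open TarFile handle

def subfile_from_tar(locator: str):
    """Return a file object for 'tar:/path/archive.tar#member.wav' locators."""
    tarpath, member = locator.split(':', 1)[1].split('#', 1)
    if tarpath not in _tar2info:
        f = tarfile.open(tarpath)
        _tar2info[tarpath] = {info.name: info for info in f.getmembers()}
        _tar2object[tarpath] = f
    return _tar2object[tarpath].extractfile(_tar2info[tarpath][member])
```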
+ :rtype: tuple of (2darray, list) + """ + start_time = time.time() + if isinstance(audio_file, str) and audio_file.startswith('tar:'): + speech_segment = SpeechSegment.from_file( + self._subfile_from_tar(audio_file), transcript) + else: + speech_segment = SpeechSegment.from_file(audio_file, transcript) + load_wav_time = time.time() - start_time + #logger.debug(f"load wav time: {load_wav_time}") + + # audio augment + start_time = time.time() + self._augmentation_pipeline.transform_audio(speech_segment) + audio_aug_time = time.time() - start_time + #logger.debug(f"audio augmentation time: {audio_aug_time}") + + start_time = time.time() + specgram, transcript_part = self._speech_featurizer.featurize( + speech_segment, self._keep_transcription_text) + if self._normalizer: + specgram = self._normalizer.apply(specgram) + feature_time = time.time() - start_time + #logger.debug(f"audio & test feature time: {feature_time}") + + # specgram augment + start_time = time.time() + specgram = self._augmentation_pipeline.transform_feature(specgram) + feature_aug_time = time.time() - start_time + #logger.debug(f"audio feature augmentation time: {feature_aug_time}") + return specgram, transcript_part + def __call__(self, batch): """batch examples @@ -53,6 +167,7 @@ class SpeechCollator(): text_lens = [] utts = [] for utt, audio, text in batch: + audio, text = self.process_utterance(audio, text) #utt utts.append(utt) # audio diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py index eaa57a4e..fc687902 100644 --- a/deepspeech/io/dataset.py +++ b/deepspeech/io/dataset.py @@ -34,9 +34,6 @@ __all__ = [ logger = Log(__name__).getlog() -# namedtupe need global for pickle. -TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object']) - class ManifestDataset(Dataset): @classmethod @@ -192,10 +189,6 @@ class ManifestDataset(Dataset): self._stride_ms = stride_ms self._target_sample_rate = target_sample_rate - self._normalizer = FeatureNormalizer( - mean_std_filepath) if mean_std_filepath else None - self._augmentation_pipeline = AugmentationPipeline( - augmentation_config=augmentation_config, random_seed=random_seed) self._speech_featurizer = SpeechFeaturizer( unit_type=unit_type, vocab_filepath=vocab_filepath, @@ -214,8 +207,6 @@ class ManifestDataset(Dataset): self._rng = np.random.RandomState(random_seed) self._keep_transcription_text = keep_transcription_text - # for caching tar files info - self._local_data = TarLocalData(tar2info={}, tar2object={}) # read manifest self._manifest = read_manifest( @@ -256,74 +247,7 @@ class ManifestDataset(Dataset): def stride_ms(self): return self._speech_featurizer.stride_ms - def _parse_tar(self, file): - """Parse a tar file to get a tarfile object - and a map containing tarinfoes - """ - result = {} - f = tarfile.open(file) - for tarinfo in f.getmembers(): - result[tarinfo.name] = tarinfo - return f, result - - def _subfile_from_tar(self, file): - """Get subfile object from tar. - It will return a subfile object from tar file - and cached tar file info for next reading request. 
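With `process_utterance` relocated, the `__call__` above now performs the expensive per-utterance work (load, audio augmentation, featurization, normalization, feature augmentation) inside the collator, which `DataLoader` executes in its worker processes; the dataset, as the removals below show, is reduced to handing back raw manifest fields. A schematic of the resulting collate flow (placeholder feature logic, not the real featurizer):

```python
import numpy as np

class CollateSketch:
    """Schematic of SpeechCollator.__call__ after this patch."""

    def process_utterance(self, audio_file, transcript):
        # real code: load wav -> augment audio -> featurize ->
        # normalize -> augment spectrogram
        specgram = np.zeros((161, 50), dtype='float32')  # placeholder (D, T)
        tokens = [1, 2, 3]                               # placeholder ids
        return specgram, tokens

    def __call__(self, batch):
        utts, audios, audio_lens, texts, text_lens = [], [], [], [], []
        for utt, audio, text in batch:
            audio, text = self.process_utterance(audio, text)
            utts.append(utt)
            audios.append(audio.T)           # time-major for padding
            audio_lens.append(audio.shape[1])
            texts.append(text)
            text_lens.append(len(text))
        # the real implementation pads audios/texts with pad_sequence here
        return utts, audios, audio_lens, texts, text_lens
```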
- """ - tarpath, filename = file.split(':', 1)[1].split('#', 1) - if 'tar2info' not in self._local_data.__dict__: - self._local_data.tar2info = {} - if 'tar2object' not in self._local_data.__dict__: - self._local_data.tar2object = {} - if tarpath not in self._local_data.tar2info: - object, infoes = self._parse_tar(tarpath) - self._local_data.tar2info[tarpath] = infoes - self._local_data.tar2object[tarpath] = object - return self._local_data.tar2object[tarpath].extractfile( - self._local_data.tar2info[tarpath][filename]) - - def process_utterance(self, utt, audio_file, transcript): - """Load, augment, featurize and normalize for speech data. - - :param audio_file: Filepath or file object of audio file. - :type audio_file: str | file - :param transcript: Transcription text. - :type transcript: str - :return: Tuple of audio feature tensor and data of transcription part, - where transcription part could be token ids or text. - :rtype: tuple of (2darray, list) - """ - start_time = time.time() - if isinstance(audio_file, str) and audio_file.startswith('tar:'): - speech_segment = SpeechSegment.from_file( - self._subfile_from_tar(audio_file), transcript) - else: - speech_segment = SpeechSegment.from_file(audio_file, transcript) - load_wav_time = time.time() - start_time - #logger.debug(f"load wav time: {load_wav_time}") - - # audio augment - start_time = time.time() - self._augmentation_pipeline.transform_audio(speech_segment) - audio_aug_time = time.time() - start_time - #logger.debug(f"audio augmentation time: {audio_aug_time}") - - start_time = time.time() - specgram, transcript_part = self._speech_featurizer.featurize( - speech_segment, self._keep_transcription_text) - if self._normalizer: - specgram = self._normalizer.apply(specgram) - feature_time = time.time() - start_time - #logger.debug(f"audio & test feature time: {feature_time}") - - # specgram augment - start_time = time.time() - specgram = self._augmentation_pipeline.transform_feature(specgram) - feature_aug_time = time.time() - start_time - #logger.debug(f"audio feature augmentation time: {feature_aug_time}") - return utt, specgram, transcript_part def _instance_reader_creator(self, manifest): """ @@ -336,8 +260,6 @@ class ManifestDataset(Dataset): def reader(): for instance in manifest: - # inst = self.process_utterance(instance["feat"], - # instance["text"]) inst = self.process_utterance(instance["utt"], instance["feat"], instance["text"]) yield inst @@ -349,6 +271,4 @@ class ManifestDataset(Dataset): def __getitem__(self, idx): instance = self._manifest[idx] - return self.process_utterance(instance["utt"], instance["feat"], - instance["text"]) - # return self.process_utterance(instance["feat"], instance["text"]) + return(instance["utt"], instance["feat"], instance["text"]) diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml index dd9ce51f..aeb4f099 100644 --- a/examples/tiny/s0/conf/deepspeech2.yaml +++ b/examples/tiny/s0/conf/deepspeech2.yaml @@ -6,7 +6,7 @@ data: mean_std_filepath: data/mean_std.json vocab_filepath: data/vocab.txt augmentation_config: conf/augmentation.json - batch_size: 4 + batch_size: 2 min_input_len: 0.0 max_input_len: 27.0 min_output_len: 0.0 @@ -37,7 +37,7 @@ model: share_rnn_weights: True training: - n_epoch: 20 + n_epoch: 10 lr: 1e-5 lr_decay: 1.0 weight_decay: 1e-06 From c706dfec2ab292c91fe95cc1947330772c3bc493 Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Wed, 9 Jun 2021 12:54:01 +0000 Subject: [PATCH 02/14] fix bug --- 
deepspeech/exps/deepspeech2/model.py | 4 ++-- deepspeech/io/collator.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index 50ff3c17..bcd66d19 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -165,7 +165,7 @@ class DeepSpeech2Trainer(Trainer): sortagrad=config.data.sortagrad, shuffle_method=config.data.shuffle_method) - collate_fn = SpeechCollator(config, keep_transcription_text=False) + collate_fn = SpeechCollator(config=config, keep_transcription_text=False) self.train_loader = DataLoader( train_dataset, batch_sampler=batch_sampler, @@ -342,7 +342,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): batch_size=config.decoding.batch_size, shuffle=False, drop_last=False, - collate_fn=SpeechCollator(keep_transcription_text=True)) + collate_fn=SpeechCollator(config=config, keep_transcription_text=True)) logger.info("Setup test Dataloader!") def setup_output_dir(self): diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index d725b0b1..0f86b8e7 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -23,6 +23,8 @@ from deepspeech.frontend.speech import SpeechSegment import io import time +from collections import namedtuple + __all__ = ["SpeechCollator"] logger = Log(__name__).getlog() @@ -50,7 +52,7 @@ class SpeechCollator(): aug_file = config.data.augmentation_config assert isinstance(aug_file, io.StringIO) - self._local_data = TarLocalData(tar2info={}, tar2object={}) + self._local_data = TarLocalData(tar2info={}, tar2object={}) self._augmentation_pipeline = AugmentationPipeline( augmentation_config=aug_file.read(), random_seed=config.data.random_seed) From 2b51d612dd64653bb407f76b648a48ad71b090de Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Wed, 9 Jun 2021 13:42:19 +0000 Subject: [PATCH 03/14] delete _instance_reader_creator func in dataset --- deepspeech/io/dataset.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py index fc687902..929a6cf8 100644 --- a/deepspeech/io/dataset.py +++ b/deepspeech/io/dataset.py @@ -249,22 +249,22 @@ class ManifestDataset(Dataset): - def _instance_reader_creator(self, manifest): - """ - Instance reader creator. Create a callable function to produce - instances of data. - - Instance: a tuple of ndarray of audio spectrogram and a list of - token indices for transcript. - """ - - def reader(): - for instance in manifest: - inst = self.process_utterance(instance["utt"], instance["feat"], - instance["text"]) - yield inst - - return reader + # def _instance_reader_creator(self, manifest): + # """ + # Instance reader creator. Create a callable function to produce + # instances of data. + + # Instance: a tuple of ndarray of audio spectrogram and a list of + # token indices for transcript. 
+ # """ + + # def reader(): + # for instance in manifest: + # inst = self.process_utterance(instance["utt"], instance["feat"], + # instance["text"]) + # yield inst + + # return reader def __len__(self): return len(self._manifest) From 3d5f294363ebc3a732b5f29714f9b057431ed52c Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Thu, 10 Jun 2021 03:13:35 +0000 Subject: [PATCH 04/14] dataset --- deepspeech/io/dataset.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py index 929a6cf8..6083d7ec 100644 --- a/deepspeech/io/dataset.py +++ b/deepspeech/io/dataset.py @@ -247,25 +247,6 @@ class ManifestDataset(Dataset): def stride_ms(self): return self._speech_featurizer.stride_ms - - - # def _instance_reader_creator(self, manifest): - # """ - # Instance reader creator. Create a callable function to produce - # instances of data. - - # Instance: a tuple of ndarray of audio spectrogram and a list of - # token indices for transcript. - # """ - - # def reader(): - # for instance in manifest: - # inst = self.process_utterance(instance["utt"], instance["feat"], - # instance["text"]) - # yield inst - - # return reader - def __len__(self): return len(self._manifest) From 3855522ee3b43bc5726eb7f37a0dd8bd0e9355a2 Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Thu, 10 Jun 2021 11:37:25 +0000 Subject: [PATCH 05/14] config --- deepspeech/exps/deepspeech2/config.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/deepspeech/exps/deepspeech2/config.py b/deepspeech/exps/deepspeech2/config.py index a8d452a9..37b00086 100644 --- a/deepspeech/exps/deepspeech2/config.py +++ b/deepspeech/exps/deepspeech2/config.py @@ -38,8 +38,6 @@ _C.data = CN( target_sample_rate=16000, # target sample rate use_dB_normalization=True, target_dB=-20, - random_seed=0, - keep_transcription_text=False, batch_size=32, # batch size num_workers=0, # data loader workers sortagrad=False, # sorted in first epoch when True @@ -55,6 +53,28 @@ _C.model = CN( share_rnn_weights=True #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported. 
)) +_C.collator =CN( + dict( + augmentation_config="", + random_seed=0, + mean_std_filepath="", + unit_type="char", + vocab_filepath="", + spm_model_prefix="", + specgram_type='linear', # 'linear', 'mfcc', 'fbank' + feat_dim=0, # 'mfcc', 'fbank' + delta_delta=False, # 'mfcc', 'fbank' + stride_ms=10.0, # ms + window_ms=20.0, # ms + n_fft=None, # fft points + max_freq=None, # None for samplerate/2 + target_sample_rate=16000, # target sample rate + use_dB_normalization=True, + target_dB=-20, + dither=1.0, # feature dither + keep_transcription_text=True + )) + DeepSpeech2Model.params(_C.model) _C.training = CN( From b9110af9d340caf4e3e32e0eafa2fca6946d7296 Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Fri, 11 Jun 2021 02:44:02 +0000 Subject: [PATCH 06/14] feat_dim, vocab_size --- deepspeech/exps/deepspeech2/model.py | 4 +- .../frontend/featurizer/speech_featurizer.py | 43 ----- deepspeech/frontend/utility.py | 2 +- deepspeech/io/collator.py | 166 +++++++++++++++--- deepspeech/io/dataset.py | 128 +++++++------- examples/tiny/s0/conf/deepspeech2.yaml | 23 ++- 6 files changed, 227 insertions(+), 139 deletions(-) diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index bcd66d19..679261cf 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -137,7 +137,7 @@ class DeepSpeech2Trainer(Trainer): def setup_dataloader(self): config = self.config.clone() config.defrost() - config.data.keep_transcription_text = False + config.collator.keep_transcription_text = False config.data.manifest = config.data.train_manifest train_dataset = ManifestDataset.from_config(config) @@ -165,7 +165,7 @@ class DeepSpeech2Trainer(Trainer): sortagrad=config.data.sortagrad, shuffle_method=config.data.shuffle_method) - collate_fn = SpeechCollator(config=config, keep_transcription_text=False) + collate_fn = SpeechCollator.from_config(config) self.train_loader = DataLoader( train_dataset, batch_sampler=batch_sampler, diff --git a/deepspeech/frontend/featurizer/speech_featurizer.py b/deepspeech/frontend/featurizer/speech_featurizer.py index e6761cb5..bcb8e3f4 100644 --- a/deepspeech/frontend/featurizer/speech_featurizer.py +++ b/deepspeech/frontend/featurizer/speech_featurizer.py @@ -104,50 +104,7 @@ class SpeechFeaturizer(object): speech_segment.transcript) return spec_feature, text_ids - @property - def vocab_size(self): - """Return the vocabulary size. - - Returns: - int: Vocabulary size. - """ - return self._text_featurizer.vocab_size - - @property - def vocab_list(self): - """Return the vocabulary in list. - Returns: - List[str]: - """ - return self._text_featurizer.vocab_list - - @property - def vocab_dict(self): - """Return the vocabulary in dict. - - Returns: - Dict[str, int]: - """ - return self._text_featurizer.vocab_dict - - @property - def feature_size(self): - """Return the audio feature size. - - Returns: - int: audio feature size. 
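PATCH 05 above splits the configuration into a `data` group (manifest selection and filtering) and a `collator` group (featurization parameters), which is straightforward with nested yacs `CfgNode` trees. A minimal demonstration of the mechanism these configs rely on (assuming the `yacs` package):

```python
from yacs.config import CfgNode as CN

_C = CN()
_C.collator = CN(dict(
    specgram_type='linear',   # 'linear', 'mfcc', 'fbank'
    stride_ms=10.0,
    window_ms=20.0,
    keep_transcription_text=False))

config = _C.clone()
config.merge_from_list(['collator.specgram_type', 'fbank'])  # override one key
config.freeze()   # trainers freeze before use; defrost() re-enables edits
print(config.collator.specgram_type)   # -> fbank
```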
- """ - return self._audio_featurizer.feature_size - - @property - def stride_ms(self): - """time length in `ms` unit per frame - - Returns: - float: time(ms)/frame - """ - return self._audio_featurizer.stride_ms @property def text_feature(self): diff --git a/deepspeech/frontend/utility.py b/deepspeech/frontend/utility.py index b2dd9601..610104f9 100644 --- a/deepspeech/frontend/utility.py +++ b/deepspeech/frontend/utility.py @@ -82,7 +82,7 @@ def read_manifest( ] if all(conditions): manifest.append(json_data) - return manifest + return manifest, json_data["feat_shape"][-1] def rms_to_db(rms: float): diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index 0f86b8e7..4efc69a0 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -22,6 +22,8 @@ from deepspeech.frontend.normalizer import FeatureNormalizer from deepspeech.frontend.speech import SpeechSegment import io import time +from yacs.config import CfgNode +from typing import Optional from collections import namedtuple @@ -33,51 +35,134 @@ logger = Log(__name__).getlog() TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object']) class SpeechCollator(): - def __init__(self, config, keep_transcription_text=True): - """ - Padding audio features with zeros to make them have the same shape (or - a user-defined shape) within one bach. + @classmethod + def params(cls, config: Optional[CfgNode]=None) -> CfgNode: + default = CfgNode( + dict( + augmentation_config="", + random_seed=0, + mean_std_filepath="", + unit_type="char", + vocab_filepath="", + spm_model_prefix="", + specgram_type='linear', # 'linear', 'mfcc', 'fbank' + feat_dim=0, # 'mfcc', 'fbank' + delta_delta=False, # 'mfcc', 'fbank' + stride_ms=10.0, # ms + window_ms=20.0, # ms + n_fft=None, # fft points + max_freq=None, # None for samplerate/2 + target_sample_rate=16000, # target sample rate + use_dB_normalization=True, + target_dB=-20, + dither=1.0, # feature dither + keep_transcription_text=True + )) - if ``keep_transcription_text`` is False, text is token ids else is raw string. + if config is not None: + config.merge_from_other_cfg(default) + return default + + @classmethod + def from_config(cls, config): + """Build a SpeechCollator object from a config. + + Args: + config (yacs.config.CfgNode): configs object. + + Returns: + SpeechCollator: collator object. 
""" - self._keep_transcription_text = keep_transcription_text + assert 'augmentation_config' in config.collator + assert 'keep_transcription_text' in config.collator + assert 'mean_std_filepath' in config.collator + assert 'vocab_filepath' in config.data + assert 'specgram_type' in config.collator + assert 'n_fft' in config.collator + assert config.collator - if isinstance(config.data.augmentation_config, (str, bytes)): - if config.data.augmentation_config: + if isinstance(config.collator.augmentation_config, (str, bytes)): + if config.collator.augmentation_config: aug_file = io.open( - config.data.augmentation_config, mode='r', encoding='utf8') + config.collator.augmentation_config, mode='r', encoding='utf8') else: aug_file = io.StringIO(initial_value='{}', newline='') else: - aug_file = config.data.augmentation_config + aug_file = config.collator.augmentation_config assert isinstance(aug_file, io.StringIO) + speech_collator = cls( + aug_file=aug_file, + random_seed=0, + mean_std_filepath=config.collator.mean_std_filepath, + unit_type=config.collator.unit_type, + vocab_filepath=config.data.vocab_filepath, + spm_model_prefix=config.collator.spm_model_prefix, + specgram_type=config.collator.specgram_type, + feat_dim=config.collator.feat_dim, + delta_delta=config.collator.delta_delta, + stride_ms=config.collator.stride_ms, + window_ms=config.collator.window_ms, + n_fft=config.collator.n_fft, + max_freq=config.collator.max_freq, + target_sample_rate=config.collator.target_sample_rate, + use_dB_normalization=config.collator.use_dB_normalization, + target_dB=config.collator.target_dB, + dither=config.collator.dither, + keep_transcription_text=config.collator.keep_transcription_text + ) + return speech_collator + + def __init__(self, aug_file, mean_std_filepath, + vocab_filepath, spm_model_prefix, + random_seed=0, + unit_type="char", + specgram_type='linear', # 'linear', 'mfcc', 'fbank' + feat_dim=0, # 'mfcc', 'fbank' + delta_delta=False, # 'mfcc', 'fbank' + stride_ms=10.0, # ms + window_ms=20.0, # ms + n_fft=None, # fft points + max_freq=None, # None for samplerate/2 + target_sample_rate=16000, # target sample rate + use_dB_normalization=True, + target_dB=-20, + dither=1.0, + keep_transcription_text=True): + """ + Padding audio features with zeros to make them have the same shape (or + a user-defined shape) within one bach. + + if ``keep_transcription_text`` is False, text is token ids else is raw string. 
+ """ + self._keep_transcription_text = keep_transcription_text + self._local_data = TarLocalData(tar2info={}, tar2object={}) self._augmentation_pipeline = AugmentationPipeline( augmentation_config=aug_file.read(), - random_seed=config.data.random_seed) + random_seed=random_seed) self._normalizer = FeatureNormalizer( - config.data.mean_std_filepath) if config.data.mean_std_filepath else None + mean_std_filepath) if mean_std_filepath else None - self._stride_ms = config.data.stride_ms - self._target_sample_rate = config.data.target_sample_rate + self._stride_ms = stride_ms + self._target_sample_rate = target_sample_rate self._speech_featurizer = SpeechFeaturizer( - unit_type=config.data.unit_type, - vocab_filepath=config.data.vocab_filepath, - spm_model_prefix=config.data.spm_model_prefix, - specgram_type=config.data.specgram_type, - feat_dim=config.data.feat_dim, - delta_delta=config.data.delta_delta, - stride_ms=config.data.stride_ms, - window_ms=config.data.window_ms, - n_fft=config.data.n_fft, - max_freq=config.data.max_freq, - target_sample_rate=config.data.target_sample_rate, - use_dB_normalization=config.data.use_dB_normalization, - target_dB=config.data.target_dB, - dither=config.data.dither) + unit_type=unit_type, + vocab_filepath=vocab_filepath, + spm_model_prefix=spm_model_prefix, + specgram_type=specgram_type, + feat_dim=feat_dim, + delta_delta=delta_delta, + stride_ms=stride_ms, + window_ms=window_ms, + n_fft=n_fft, + max_freq=max_freq, + target_sample_rate=target_sample_rate, + use_dB_normalization=use_dB_normalization, + target_dB=target_dB, + dither=dither) def _parse_tar(self, file): """Parse a tar file to get a tarfile object @@ -196,3 +281,28 @@ class SpeechCollator(): texts, padding_value=IGNORE_ID).astype(np.int64) text_lens = np.array(text_lens).astype(np.int64) return utts, padded_audios, audio_lens, padded_texts, text_lens + + @property + def vocab_size(self): + return self._speech_featurizer.vocab_size + + @property + def vocab_list(self): + return self._speech_featurizer.vocab_list + + @property + def vocab_dict(self): + return self._speech_featurizer.vocab_dict + + @property + def text_feature(self): + return self._text_featurizer + self._speech_featurizer.text_feature + + @property + def feature_size(self): + return self._speech_featurizer.feature_size + + @property + def stride_ms(self): + return self._speech_featurizer.stride_ms diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py index aa5b29ed..1e3bbcd3 100644 --- a/deepspeech/io/dataset.py +++ b/deepspeech/io/dataset.py @@ -55,20 +55,6 @@ class ManifestDataset(Dataset): min_output_len=0.0, max_output_input_ratio=float('inf'), min_output_input_ratio=0.0, - stride_ms=10.0, # ms - window_ms=20.0, # ms - n_fft=None, # fft points - max_freq=None, # None for samplerate/2 - raw_wav=True, # use raw_wav or kaldi feature - specgram_type='linear', # 'linear', 'mfcc', 'fbank' - feat_dim=0, # 'mfcc', 'fbank' - delta_delta=False, # 'mfcc', 'fbank' - dither=1.0, # feature dither - target_sample_rate=16000, # target sample rate - use_dB_normalization=True, - target_dB=-20, - random_seed=0, - keep_transcription_text=False, batch_size=32, # batch size num_workers=0, # data loader workers sortagrad=False, # sorted in first epoch when True @@ -116,21 +102,19 @@ class ManifestDataset(Dataset): min_output_len=config.data.min_output_len, max_output_input_ratio=config.data.max_output_input_ratio, min_output_input_ratio=config.data.min_output_input_ratio, - stride_ms=config.data.stride_ms, - 
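Inside `process_utterance`, `self._normalizer.apply(specgram)` performs mean-variance normalization with statistics precomputed over the training corpus (the `mean_std.json` file referenced by the configs). Conceptually it is nothing more than the sketch below (a hypothetical stand-in for `FeatureNormalizer`, whose real implementation may differ):

```python
import numpy as np

class MeanStdNormalizer:
    """Sketch of mean-variance normalization with precomputed corpus stats."""

    def __init__(self, mean: np.ndarray, std: np.ndarray, eps: float=1e-20):
        self.mean = mean  # per-dimension mean over the training corpus
        self.std = std    # per-dimension standard deviation
        self.eps = eps    # guard against division by zero

    def apply(self, features: np.ndarray) -> np.ndarray:
        return (features - self.mean) / (self.std + self.eps)
```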
window_ms=config.data.window_ms, - n_fft=config.data.n_fft, - max_freq=config.data.max_freq, - target_sample_rate=config.data.target_sample_rate, - specgram_type=config.data.specgram_type, - feat_dim=config.data.feat_dim, - delta_delta=config.data.delta_delta, - dither=config.data.dither, - use_dB_normalization=config.data.use_dB_normalization, - target_dB=config.data.target_dB, - random_seed=config.data.random_seed, - keep_transcription_text=config.data.keep_transcription_text) + ) return dataset + + def _read_vocab(self, vocab_filepath): + """Load vocabulary from file.""" + vocab_lines = [] + with open(vocab_filepath, 'r', encoding='utf-8') as file: + vocab_lines.extend(file.readlines()) + vocab_list = [line[:-1] for line in vocab_lines] + return vocab_list + + def __init__(self, manifest_path, unit_type, @@ -143,20 +127,7 @@ class ManifestDataset(Dataset): max_output_len=float('inf'), min_output_len=0.0, max_output_input_ratio=float('inf'), - min_output_input_ratio=0.0, - stride_ms=10.0, - window_ms=20.0, - n_fft=None, - max_freq=None, - target_sample_rate=16000, - specgram_type='linear', - feat_dim=None, - delta_delta=False, - dither=1.0, - use_dB_normalization=True, - target_dB=-20, - random_seed=0, - keep_transcription_text=False): + min_output_input_ratio=0.0): """Manifest Dataset Args: @@ -186,30 +157,11 @@ class ManifestDataset(Dataset): keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False. """ super().__init__() - self._stride_ms = stride_ms - self._target_sample_rate = target_sample_rate - - self._speech_featurizer = SpeechFeaturizer( - unit_type=unit_type, - vocab_filepath=vocab_filepath, - spm_model_prefix=spm_model_prefix, - specgram_type=specgram_type, - feat_dim=feat_dim, - delta_delta=delta_delta, - stride_ms=stride_ms, - window_ms=window_ms, - n_fft=n_fft, - max_freq=max_freq, - target_sample_rate=target_sample_rate, - use_dB_normalization=use_dB_normalization, - target_dB=target_dB, - dither=dither) - - self._rng = np.random.RandomState(random_seed) - self._keep_transcription_text = keep_transcription_text + + # self._rng = np.random.RandomState(random_seed) # read manifest - self._manifest = read_manifest( + self._manifest, self._feature_size = read_manifest( manifest_path=manifest_path, max_input_len=max_input_len, min_input_len=min_input_len, @@ -219,9 +171,59 @@ class ManifestDataset(Dataset): min_output_input_ratio=min_output_input_ratio) self._manifest.sort(key=lambda x: x["feat_shape"][0]) + self._vocab_list = self._read_vocab(vocab_filepath) + @property def manifest(self): return self._manifest + + @property + def vocab_size(self): + """Return the vocabulary size. + + Returns: + int: Vocabulary size. + """ + return len(self._vocab_list) + + @property + def vocab_list(self): + """Return the vocabulary in list. + + Returns: + List[str]: + """ + return self._vocab_list + + @property + def vocab_dict(self): + """Return the vocabulary in dict. + + Returns: + Dict[str, int]: + """ + vocab_dict = dict( + [(token, idx) for (idx, token) in enumerate(self._vocab_list)]) + return vocab_dict + + @property + def feature_size(self): + """Return the audio feature size. + + Returns: + int: audio feature size. 
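The `_read_vocab` helper above strips each vocab line's trailing newline with `line[:-1]`, and `vocab_dict` enumerates the resulting list into a token-to-id map. A compact equivalent (using `rstrip('\n')`, which is slightly safer when the final line has no newline; that adjustment is a suggestion, not something this patch does):

```python
def read_vocab(vocab_filepath: str):
    """Return (token list, token -> id dict) from a one-token-per-line file."""
    with open(vocab_filepath, 'r', encoding='utf-8') as f:
        vocab_list = [line.rstrip('\n') for line in f]
    vocab_dict = {token: idx for idx, token in enumerate(vocab_list)}
    return vocab_list, vocab_dict
```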
+ """ + return self._feature_size + + @property + def stride_ms(self): + """time length in `ms` unit per frame + + Returns: + float: time(ms)/frame + """ + return self._audio_featurizer.stride_ms + def __len__(self): return len(self._manifest) diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml index aeb4f099..eda7c3cb 100644 --- a/examples/tiny/s0/conf/deepspeech2.yaml +++ b/examples/tiny/s0/conf/deepspeech2.yaml @@ -4,9 +4,10 @@ data: dev_manifest: data/manifest.tiny test_manifest: data/manifest.tiny mean_std_filepath: data/mean_std.json + unit_type: char vocab_filepath: data/vocab.txt augmentation_config: conf/augmentation.json - batch_size: 2 + batch_size: 4 min_input_len: 0.0 max_input_len: 27.0 min_output_len: 0.0 @@ -28,6 +29,24 @@ data: sortagrad: True shuffle_method: batch_shuffle num_workers: 0 + +collator: + augmentation_config: conf/augmentation.json + random_seed: 0 + mean_std_filepath: data/mean_std.json + spm_model_prefix: + specgram_type: linear + feat_dim: + delta_delta: False + stride_ms: 10.0 + window_ms: 20.0 + n_fft: None + max_freq: None + target_sample_rate: 16000 + use_dB_normalization: True + target_dB: -20 + dither: 1.0 + keep_transcription_text: True model: num_conv_layers: 2 @@ -37,7 +56,7 @@ model: share_rnn_weights: True training: - n_epoch: 10 + n_epoch: 21 lr: 1e-5 lr_decay: 1.0 weight_decay: 1e-06 From 7bae32f3844166d549d0180da70b13bd10ef4cf7 Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Tue, 15 Jun 2021 03:05:22 +0000 Subject: [PATCH 07/14] revise example/ting/s1 --- deepspeech/exps/deepspeech2/config.py | 2 +- deepspeech/exps/deepspeech2/model.py | 3 ++- deepspeech/exps/u2/config.py | 7 +++++++ deepspeech/exps/u2/model.py | 9 +++++---- deepspeech/frontend/utility.py | 2 +- deepspeech/io/collator.py | 23 ++++----------------- deepspeech/io/dataset.py | 12 ++--------- examples/tiny/s0/conf/deepspeech2.yaml | 16 +-------------- examples/tiny/s1/conf/transformer.yaml | 28 ++++++++++++++------------ 9 files changed, 38 insertions(+), 64 deletions(-) diff --git a/deepspeech/exps/deepspeech2/config.py b/deepspeech/exps/deepspeech2/config.py index 37b00086..1ce5346f 100644 --- a/deepspeech/exps/deepspeech2/config.py +++ b/deepspeech/exps/deepspeech2/config.py @@ -72,7 +72,7 @@ _C.collator =CN( use_dB_normalization=True, target_dB=-20, dither=1.0, # feature dither - keep_transcription_text=True + keep_transcription_text=False )) DeepSpeech2Model.params(_C.model) diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index 679261cf..7769c377 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -336,13 +336,14 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): # config.data.max_output_input_ratio = float('inf') test_dataset = ManifestDataset.from_config(config) + config.collator.keep_transcription_text = True # return text ord id self.test_loader = DataLoader( test_dataset, batch_size=config.decoding.batch_size, shuffle=False, drop_last=False, - collate_fn=SpeechCollator(config=config, keep_transcription_text=True)) + collate_fn=SpeechCollator.from_config(config)) logger.info("Setup test Dataloader!") def setup_output_dir(self): diff --git a/deepspeech/exps/u2/config.py b/deepspeech/exps/u2/config.py index 5a0b53f9..19080be7 100644 --- a/deepspeech/exps/u2/config.py +++ b/deepspeech/exps/u2/config.py @@ -22,6 +22,13 @@ _C = CfgNode() _C.data = ManifestDataset.params() +_C.collator =CfgNode( + dict( + augmentation_config="", + 
unit_type="char", + keep_transcription_text=False + )) + _C.model = U2Model.params() _C.training = U2Trainer.params() diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index 334d6bc8..89527087 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -221,7 +221,7 @@ class U2Trainer(Trainer): config.data.augmentation_config = "" dev_dataset = ManifestDataset.from_config(config) - collate_fn = SpeechCollator(keep_transcription_text=False) + collate_fn = SpeechCollator.from_config(config) if self.parallel: batch_sampler = SortagradDistributedBatchSampler( train_dataset, @@ -266,12 +266,13 @@ class U2Trainer(Trainer): # config.data.max_output_input_ratio = float('inf') test_dataset = ManifestDataset.from_config(config) # return text ord id + config.collator.keep_transcription_text = True self.test_loader = DataLoader( test_dataset, batch_size=config.decoding.batch_size, shuffle=False, drop_last=False, - collate_fn=SpeechCollator(keep_transcription_text=True)) + collate_fn=SpeechCollator.from_config(config)) logger.info("Setup train/valid/test Dataloader!") def setup_model(self): @@ -375,7 +376,7 @@ class U2Tester(U2Trainer): error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer start_time = time.time() - text_feature = self.test_loader.dataset.text_feature + text_feature = self.test_loader.collate_fn.text_feature target_transcripts = self.ordid2token(texts, texts_len) result_transcripts = self.model.decode( audio, @@ -423,7 +424,7 @@ class U2Tester(U2Trainer): self.model.eval() logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") - stride_ms = self.test_loader.dataset.stride_ms + stride_ms = self.config.collator.stride_ms error_rate_type = None errors_sum, len_refs, num_ins = 0.0, 0, 0 num_frames = 0.0 diff --git a/deepspeech/frontend/utility.py b/deepspeech/frontend/utility.py index 610104f9..b2dd9601 100644 --- a/deepspeech/frontend/utility.py +++ b/deepspeech/frontend/utility.py @@ -82,7 +82,7 @@ def read_manifest( ] if all(conditions): manifest.append(json_data) - return manifest, json_data["feat_shape"][-1] + return manifest def rms_to_db(rms: float): diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index 4efc69a0..51384ec4 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -56,7 +56,7 @@ class SpeechCollator(): use_dB_normalization=True, target_dB=-20, dither=1.0, # feature dither - keep_transcription_text=True + keep_transcription_text=False )) if config is not None: @@ -75,7 +75,7 @@ class SpeechCollator(): """ assert 'augmentation_config' in config.collator assert 'keep_transcription_text' in config.collator - assert 'mean_std_filepath' in config.collator + assert 'mean_std_filepath' in config.data assert 'vocab_filepath' in config.data assert 'specgram_type' in config.collator assert 'n_fft' in config.collator @@ -94,7 +94,7 @@ class SpeechCollator(): speech_collator = cls( aug_file=aug_file, random_seed=0, - mean_std_filepath=config.collator.mean_std_filepath, + mean_std_filepath=config.data.mean_std_filepath, unit_type=config.collator.unit_type, vocab_filepath=config.data.vocab_filepath, spm_model_prefix=config.collator.spm_model_prefix, @@ -282,26 +282,11 @@ class SpeechCollator(): text_lens = np.array(text_lens).astype(np.int64) return utts, padded_audios, audio_lens, padded_texts, text_lens - @property - def vocab_size(self): - return self._speech_featurizer.vocab_size - - @property - def vocab_list(self): - return self._speech_featurizer.vocab_list - 
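A recurring edit in these patches is redirecting lookups from `loader.dataset` to `loader.collate_fn`: once featurization lives in the collator, only the collator knows the text featurizer, the feature dimension, and the vocabulary size. The trainers' usage reduces to the sketch below (assuming, as the patches do, that the `DataLoader` exposes the attached collator as `collate_fn`):

```python
def model_dims(train_loader):
    """Read model input/output sizes off the collator attached to the loader."""
    feat_size = train_loader.collate_fn.feature_size  # input feature dim
    vocab_size = train_loader.collate_fn.vocab_size   # output (token) dim
    return feat_size, vocab_size
```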
- @property - def vocab_dict(self): - return self._speech_featurizer.vocab_dict @property def text_feature(self): - return self._text_featurizer - self._speech_featurizer.text_feature + return self._speech_featurizer.text_feature - @property - def feature_size(self): - return self._speech_featurizer.feature_size @property def stride_ms(self): diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py index 1e3bbcd3..0da347f3 100644 --- a/deepspeech/io/dataset.py +++ b/deepspeech/io/dataset.py @@ -161,7 +161,7 @@ class ManifestDataset(Dataset): # self._rng = np.random.RandomState(random_seed) # read manifest - self._manifest, self._feature_size = read_manifest( + self._manifest = read_manifest( manifest_path=manifest_path, max_input_len=max_input_len, min_input_len=min_input_len, @@ -213,16 +213,8 @@ class ManifestDataset(Dataset): Returns: int: audio feature size. """ - return self._feature_size + return self._manifest[0]["feat_shape"][-1] - @property - def stride_ms(self): - """time length in `ms` unit per frame - - Returns: - float: time(ms)/frame - """ - return self._audio_featurizer.stride_ms def __len__(self): diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml index eda7c3cb..bfed8d59 100644 --- a/examples/tiny/s0/conf/deepspeech2.yaml +++ b/examples/tiny/s0/conf/deepspeech2.yaml @@ -6,7 +6,6 @@ data: mean_std_filepath: data/mean_std.json unit_type: char vocab_filepath: data/vocab.txt - augmentation_config: conf/augmentation.json batch_size: 4 min_input_len: 0.0 max_input_len: 27.0 @@ -14,18 +13,6 @@ data: max_output_len: 400.0 min_output_input_ratio: 0.05 max_output_input_ratio: 10.0 - specgram_type: linear - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 20.0 - delta_delta: False - dither: 1.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False sortagrad: True shuffle_method: batch_shuffle num_workers: 0 @@ -33,7 +20,6 @@ data: collator: augmentation_config: conf/augmentation.json random_seed: 0 - mean_std_filepath: data/mean_std.json spm_model_prefix: specgram_type: linear feat_dim: @@ -46,7 +32,7 @@ collator: use_dB_normalization: True target_dB: -20 dither: 1.0 - keep_transcription_text: True + keep_transcription_text: False model: num_conv_layers: 2 diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/s1/conf/transformer.yaml index 0a7cf3be..cc172585 100644 --- a/examples/tiny/s1/conf/transformer.yaml +++ b/examples/tiny/s1/conf/transformer.yaml @@ -7,7 +7,6 @@ data: unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_200' mean_std_filepath: "" - augmentation_config: conf/augmentation.json batch_size: 4 min_input_len: 0.5 # second max_input_len: 20.0 # second @@ -16,23 +15,26 @@ data: min_output_input_ratio: 0.05 max_output_input_ratio: 10.0 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + sortagrad: True + shuffle_method: batch_shuffle + num_workers: 0 #2 + +collator: + augmentation_config: conf/augmentation.json + random_seed: 0 + spm_model_prefix: + specgram_type: fbank feat_dim: 80 delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None stride_ms: 10.0 - window_ms: 25.0 + window_ms: 20.0 + n_fft: None + max_freq: None + target_sample_rate: 16000 use_dB_normalization: True target_dB: -20 - random_seed: 0 + dither: 1.0 keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 - # network architecture model: @@ 
-70,7 +72,7 @@ model: training: - n_epoch: 2 + n_epoch: 3 accum_grad: 1 global_grad_clip: 5.0 optim: adam From 6ee3033cc4561ab3109ee036c3c8db9101d1c2b7 Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Wed, 16 Jun 2021 14:39:00 +0000 Subject: [PATCH 08/14] finish aishell/s0 --- deepspeech/exps/deepspeech2/model.py | 12 +-- deepspeech/exps/u2/model.py | 6 +- .../frontend/featurizer/speech_featurizer.py | 49 +++++++++- deepspeech/io/collator.py | 32 ++++++- deepspeech/io/dataset.py | 90 +++++++++---------- examples/aishell/s0/conf/deepspeech2.yaml | 24 ++--- examples/tiny/s0/conf/deepspeech2.yaml | 2 +- examples/tiny/s1/conf/transformer.yaml | 2 +- 8 files changed, 147 insertions(+), 70 deletions(-) diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index 7769c377..5833382a 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -102,8 +102,8 @@ class DeepSpeech2Trainer(Trainer): def setup_model(self): config = self.config model = DeepSpeech2Model( - feat_size=self.train_loader.dataset.feature_size, - dict_size=self.train_loader.dataset.vocab_size, + feat_size=self.train_loader.collate_fn.feature_size, + dict_size=self.train_loader.collate_fn.vocab_size, num_conv_layers=config.model.num_conv_layers, num_rnn_layers=config.model.num_rnn_layers, rnn_size=config.model.rnn_layer_size, @@ -199,7 +199,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer - vocab_list = self.test_loader.dataset.vocab_list + vocab_list = self.test_loader.collate_fn.vocab_list target_transcripts = self.ordid2token(texts, texts_len) result_transcripts = self.model.decode( @@ -272,7 +272,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): infer_model = DeepSpeech2InferModel.from_pretrained( self.test_loader.dataset, self.config, self.args.checkpoint_path) infer_model.eval() - feat_dim = self.test_loader.dataset.feature_size + feat_dim = self.test_loader.collate_fn.feature_size static_model = paddle.jit.to_static( infer_model, input_spec=[ @@ -308,8 +308,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): def setup_model(self): config = self.config model = DeepSpeech2Model( - feat_size=self.test_loader.dataset.feature_size, - dict_size=self.test_loader.dataset.vocab_size, + feat_size=self.test_loader.collate_fn.feature_size, + dict_size=self.test_loader.collate_fn.vocab_size, num_conv_layers=config.model.num_conv_layers, num_rnn_layers=config.model.num_rnn_layers, rnn_size=config.model.rnn_layer_size, diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index 89527087..676768ce 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -279,8 +279,8 @@ class U2Trainer(Trainer): config = self.config model_conf = config.model model_conf.defrost() - model_conf.input_dim = self.train_loader.dataset.feature_size - model_conf.output_dim = self.train_loader.dataset.vocab_size + model_conf.input_dim = self.train_loader.collate_fn.feature_size + model_conf.output_dim = self.train_loader.collate_fn.vocab_size model_conf.freeze() model = U2Model.from_config(model_conf) @@ -497,7 +497,7 @@ class U2Tester(U2Trainer): infer_model = U2InferModel.from_pretrained(self.test_loader.dataset, self.config.model.clone(), self.args.checkpoint_path) - feat_dim = self.test_loader.dataset.feature_size + feat_dim = 
self.test_loader.collate_fn.feature_size input_spec = [ paddle.static.InputSpec( shape=[None, feat_dim, None], diff --git a/deepspeech/frontend/featurizer/speech_featurizer.py b/deepspeech/frontend/featurizer/speech_featurizer.py index bcb8e3f4..852d26c9 100644 --- a/deepspeech/frontend/featurizer/speech_featurizer.py +++ b/deepspeech/frontend/featurizer/speech_featurizer.py @@ -104,13 +104,60 @@ class SpeechFeaturizer(object): speech_segment.transcript) return spec_feature, text_ids + @property + def vocab_size(self): + """Return the vocabulary size. + Returns: + int: Vocabulary size. + """ + return self._text_featurizer.vocab_size + @property + def vocab_list(self): + """Return the vocabulary in list. + Returns: + List[str]: + """ + return self._text_featurizer.vocab_list + + @property + def vocab_dict(self): + """Return the vocabulary in dict. + Returns: + Dict[str, int]: + """ + return self._text_featurizer.vocab_dict + + @property + def feature_size(self): + """Return the audio feature size. + Returns: + int: audio feature size. + """ + return self._audio_featurizer.feature_size + + @property + def stride_ms(self): + """time length in `ms` unit per frame + Returns: + float: time(ms)/frame + """ + return self._audio_featurizer.stride_ms @property def text_feature(self): """Return the text feature object. - Returns: TextFeaturizer: object. """ return self._text_featurizer + + + # @property + # def text_feature(self): + # """Return the text feature object. + + # Returns: + # TextFeaturizer: object. + # """ + # return self._text_featurizer diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index 51384ec4..8b8575db 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -283,11 +283,41 @@ class SpeechCollator(): return utts, padded_audios, audio_lens, padded_texts, text_lens + # @property + # def text_feature(self): + # return self._speech_featurizer.text_feature + + + # @property + # def stride_ms(self): + # return self._speech_featurizer.stride_ms + +########### + + @property + def manifest(self): + return self._manifest + + @property + def vocab_size(self): + return self._speech_featurizer.vocab_size + + @property + def vocab_list(self): + return self._speech_featurizer.vocab_list + + @property + def vocab_dict(self): + return self._speech_featurizer.vocab_dict + @property def text_feature(self): return self._speech_featurizer.text_feature + @property + def feature_size(self): + return self._speech_featurizer.feature_size @property def stride_ms(self): - return self._speech_featurizer.stride_ms + return self._speech_featurizer.stride_ms \ No newline at end of file diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py index 0da347f3..24d8486a 100644 --- a/deepspeech/io/dataset.py +++ b/deepspeech/io/dataset.py @@ -55,10 +55,6 @@ class ManifestDataset(Dataset): min_output_len=0.0, max_output_input_ratio=float('inf'), min_output_input_ratio=0.0, - batch_size=32, # batch size - num_workers=0, # data loader workers - sortagrad=False, # sorted in first epoch when True - shuffle_method="batch_shuffle", # 'batch_shuffle', 'instance_shuffle' )) if config is not None: @@ -77,7 +73,7 @@ class ManifestDataset(Dataset): """ assert 'manifest' in config.data assert config.data.manifest - assert 'keep_transcription_text' in config.data + assert 'keep_transcription_text' in config.collator if isinstance(config.data.augmentation_config, (str, bytes)): if config.data.augmentation_config: @@ -171,51 +167,51 @@ class ManifestDataset(Dataset): 
min_output_input_ratio=min_output_input_ratio) self._manifest.sort(key=lambda x: x["feat_shape"][0]) - self._vocab_list = self._read_vocab(vocab_filepath) + # self._vocab_list = self._read_vocab(vocab_filepath) - @property - def manifest(self): - return self._manifest - - @property - def vocab_size(self): - """Return the vocabulary size. - - Returns: - int: Vocabulary size. - """ - return len(self._vocab_list) - - @property - def vocab_list(self): - """Return the vocabulary in list. - - Returns: - List[str]: - """ - return self._vocab_list - - @property - def vocab_dict(self): - """Return the vocabulary in dict. - - Returns: - Dict[str, int]: - """ - vocab_dict = dict( - [(token, idx) for (idx, token) in enumerate(self._vocab_list)]) - return vocab_dict - - @property - def feature_size(self): - """Return the audio feature size. - - Returns: - int: audio feature size. - """ - return self._manifest[0]["feat_shape"][-1] + # @property + # def manifest(self): + # return self._manifest + # @property + # def vocab_size(self): + # """Return the vocabulary size. + + # Returns: + # int: Vocabulary size. + # """ + # return len(self._vocab_list) + + # @property + # def vocab_list(self): + # """Return the vocabulary in list. + + # Returns: + # List[str]: + # """ + # return self._vocab_list + + # @property + # def vocab_dict(self): + # """Return the vocabulary in dict. + + # Returns: + # Dict[str, int]: + # """ + # vocab_dict = dict( + # [(token, idx) for (idx, token) in enumerate(self._vocab_list)]) + # return vocab_dict + + # @property + # def feature_size(self): + # """Return the audio feature size. + + # Returns: + # int: audio feature size. + # """ + # return self._manifest[0]["feat_shape"][-1] + def __len__(self): return len(self._manifest) diff --git a/examples/aishell/s0/conf/deepspeech2.yaml b/examples/aishell/s0/conf/deepspeech2.yaml index 8b08ee30..e5ab8e04 100644 --- a/examples/aishell/s0/conf/deepspeech2.yaml +++ b/examples/aishell/s0/conf/deepspeech2.yaml @@ -5,7 +5,6 @@ data: test_manifest: data/manifest.test mean_std_filepath: data/mean_std.json vocab_filepath: data/vocab.txt - augmentation_config: conf/augmentation.json batch_size: 64 # one gpu min_input_len: 0.0 max_input_len: 27.0 # second @@ -13,21 +12,26 @@ data: max_output_len: .inf min_output_input_ratio: 0.00 max_output_input_ratio: .inf + sortagrad: True + shuffle_method: batch_shuffle + num_workers: 0 + +collator: + augmentation_config: conf/augmentation.json + random_seed: 0 + spm_model_prefix: specgram_type: linear - target_sample_rate: 16000 - max_freq: None - n_fft: None + feat_dim: + delta_delta: False stride_ms: 10.0 window_ms: 20.0 - delta_delta: False - dither: 1.0 + n_fft: None + max_freq: None + target_sample_rate: 16000 use_dB_normalization: True target_dB: -20 - random_seed: 0 + dither: 1.0 keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 0 model: num_conv_layers: 2 diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml index bfed8d59..6680e568 100644 --- a/examples/tiny/s0/conf/deepspeech2.yaml +++ b/examples/tiny/s0/conf/deepspeech2.yaml @@ -42,7 +42,7 @@ model: share_rnn_weights: True training: - n_epoch: 21 + n_epoch: 23 lr: 1e-5 lr_decay: 1.0 weight_decay: 1e-06 diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/s1/conf/transformer.yaml index cc172585..5e28e4e8 100644 --- a/examples/tiny/s1/conf/transformer.yaml +++ b/examples/tiny/s1/conf/transformer.yaml @@ -72,7 +72,7 @@ model: training: - n_epoch: 3 + 
n_epoch: 21 accum_grad: 1 global_grad_clip: 5.0 optim: adam From 89a00eabeb6aaf0512be2283a563d087423c23bd Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Thu, 17 Jun 2021 00:36:57 +0000 Subject: [PATCH 09/14] revise deepspeech/exps/u2/model.py --- deepspeech/exps/u2/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index 676768ce..164903e6 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -424,7 +424,7 @@ class U2Tester(U2Trainer): self.model.eval() logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") - stride_ms = self.config.collator.stride_ms + stride_ms = self.test_loader.collate_fn.stride_ms error_rate_type = None errors_sum, len_refs, num_ins = 0.0, 0, 0 num_frames = 0.0 From 698d7a9bdb3de1a763ed8ba7a71b68241e3eea17 Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Thu, 17 Jun 2021 07:16:52 +0000 Subject: [PATCH 10/14] move batch_size, work_nums, shuffle_method, sortagrad to collator --- deepspeech/exps/deepspeech2/config.py | 20 +++++------------ deepspeech/exps/deepspeech2/model.py | 18 +++++++-------- deepspeech/exps/u2/config.py | 6 ++++- .../frontend/featurizer/speech_featurizer.py | 10 --------- deepspeech/io/collator.py | 22 ------------------- examples/aishell/s0/conf/deepspeech2.yaml | 9 ++++---- examples/tiny/s0/conf/deepspeech2.yaml | 9 ++++---- 7 files changed, 29 insertions(+), 65 deletions(-) diff --git a/deepspeech/exps/deepspeech2/config.py b/deepspeech/exps/deepspeech2/config.py index 1ce5346f..faaff1aa 100644 --- a/deepspeech/exps/deepspeech2/config.py +++ b/deepspeech/exps/deepspeech2/config.py @@ -28,20 +28,6 @@ _C.data = CN( augmentation_config="", max_duration=float('inf'), min_duration=0.0, - stride_ms=10.0, # ms - window_ms=20.0, # ms - n_fft=None, # fft points - max_freq=None, # None for samplerate/2 - specgram_type='linear', # 'linear', 'mfcc', 'fbank' - feat_dim=0, # 'mfcc', 'fbank' - delat_delta=False, # 'mfcc', 'fbank' - target_sample_rate=16000, # target sample rate - use_dB_normalization=True, - target_dB=-20, - batch_size=32, # batch size - num_workers=0, # data loader workers - sortagrad=False, # sorted in first epoch when True - shuffle_method="batch_shuffle", # 'batch_shuffle', 'instance_shuffle' )) _C.model = CN( @@ -72,7 +58,11 @@ _C.collator =CN( use_dB_normalization=True, target_dB=-20, dither=1.0, # feature dither - keep_transcription_text=False + keep_transcription_text=False, + batch_size=32, # batch size + num_workers=0, # data loader workers + sortagrad=False, # sorted in first epoch when True + shuffle_method="batch_shuffle", # 'batch_shuffle', 'instance_shuffle' )) DeepSpeech2Model.params(_C.model) diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index 5833382a..b54192dd 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -55,7 +55,7 @@ class DeepSpeech2Trainer(Trainer): 'train_loss': float(loss), } msg += "train time: {:>.3f}s, ".format(iteration_time) - msg += "batch size: {}, ".format(self.config.data.batch_size) + msg += "batch size: {}, ".format(self.config.collator.batch_size) msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_np.items()) logger.info(msg) @@ -149,31 +149,31 @@ class DeepSpeech2Trainer(Trainer): if self.parallel: batch_sampler = SortagradDistributedBatchSampler( train_dataset, - batch_size=config.data.batch_size, + batch_size=config.collator.batch_size, 
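The `sortagrad` flag consumed by these samplers trains the first epoch in length-sorted order (the dataset is already sorted by `feat_shape[0]`), which keeps early batches uniform in length, and then switches to batch-level shuffling. A simplified schematic of that sampling policy (a hypothetical helper, not the actual `SortagradBatchSampler`):

```python
import random

def epoch_indices(num_samples, batch_size, epoch, sortagrad=True, seed=0):
    """First epoch in sorted (duration) order; later epochs batch-shuffled."""
    indices = list(range(num_samples))  # dataset is pre-sorted by duration
    batches = [indices[i:i + batch_size]
               for i in range(0, num_samples, batch_size)]
    if not (sortagrad and epoch == 0):
        random.Random(seed + epoch).shuffle(batches)  # 'batch_shuffle' method
    return [i for batch in batches for i in batch]
```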
num_replicas=None, rank=None, shuffle=True, drop_last=True, - sortagrad=config.data.sortagrad, - shuffle_method=config.data.shuffle_method) + sortagrad=config.collator.sortagrad, + shuffle_method=config.collator.shuffle_method) else: batch_sampler = SortagradBatchSampler( train_dataset, shuffle=True, - batch_size=config.data.batch_size, + batch_size=config.collator.batch_size, drop_last=True, - sortagrad=config.data.sortagrad, - shuffle_method=config.data.shuffle_method) + sortagrad=config.collator.sortagrad, + shuffle_method=config.collator.shuffle_method) collate_fn = SpeechCollator.from_config(config) self.train_loader = DataLoader( train_dataset, batch_sampler=batch_sampler, collate_fn=collate_fn, - num_workers=config.data.num_workers) + num_workers=config.collator.num_workers) self.valid_loader = DataLoader( dev_dataset, - batch_size=config.data.batch_size, + batch_size=config.collator.batch_size, shuffle=False, drop_last=False, collate_fn=collate_fn) diff --git a/deepspeech/exps/u2/config.py b/deepspeech/exps/u2/config.py index 19080be7..42725c74 100644 --- a/deepspeech/exps/u2/config.py +++ b/deepspeech/exps/u2/config.py @@ -26,7 +26,11 @@ _C.collator =CfgNode( dict( augmentation_config="", unit_type="char", - keep_transcription_text=False + keep_transcription_text=False, + batch_size=32, # batch size + num_workers=0, # data loader workers + sortagrad=False, # sorted in first epoch when True + shuffle_method="batch_shuffle" # 'batch_shuffle', 'instance_shuffle' )) _C.model = U2Model.params() diff --git a/deepspeech/frontend/featurizer/speech_featurizer.py b/deepspeech/frontend/featurizer/speech_featurizer.py index 852d26c9..0fbbc564 100644 --- a/deepspeech/frontend/featurizer/speech_featurizer.py +++ b/deepspeech/frontend/featurizer/speech_featurizer.py @@ -151,13 +151,3 @@ class SpeechFeaturizer(object): TextFeaturizer: object. """ return self._text_featurizer - - - # @property - # def text_feature(self): - # """Return the text feature object. - - # Returns: - # TextFeaturizer: object. - # """ - # return self._text_featurizer diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index 8b8575db..ac817a19 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -203,34 +203,22 @@ class SpeechCollator(): where transcription part could be token ids or text. 
         :rtype: tuple of (2darray, list)
         """
-        start_time = time.time()
         if isinstance(audio_file, str) and audio_file.startswith('tar:'):
             speech_segment = SpeechSegment.from_file(
                 self._subfile_from_tar(audio_file), transcript)
         else:
             speech_segment = SpeechSegment.from_file(audio_file, transcript)
-        load_wav_time = time.time() - start_time
-        #logger.debug(f"load wav time: {load_wav_time}")

         # audio augment
-        start_time = time.time()
         self._augmentation_pipeline.transform_audio(speech_segment)
-        audio_aug_time = time.time() - start_time
-        #logger.debug(f"audio augmentation time: {audio_aug_time}")

-        start_time = time.time()
         specgram, transcript_part = self._speech_featurizer.featurize(
             speech_segment, self._keep_transcription_text)
         if self._normalizer:
             specgram = self._normalizer.apply(specgram)
-        feature_time = time.time() - start_time
-        #logger.debug(f"audio & test feature time: {feature_time}")

         # specgram augment
-        start_time = time.time()
         specgram = self._augmentation_pipeline.transform_feature(specgram)
-        feature_aug_time = time.time() - start_time
-        #logger.debug(f"audio feature augmentation time: {feature_aug_time}")
         return specgram, transcript_part

     def __call__(self, batch):
@@ -283,16 +271,6 @@ class SpeechCollator():

         return utts, padded_audios, audio_lens, padded_texts, text_lens

-    # @property
-    # def text_feature(self):
-    #     return self._speech_featurizer.text_feature
-
-
-    # @property
-    # def stride_ms(self):
-    #     return self._speech_featurizer.stride_ms
-
-###########
     @property
     def manifest(self):

diff --git a/examples/aishell/s0/conf/deepspeech2.yaml b/examples/aishell/s0/conf/deepspeech2.yaml
index e5ab8e04..54ce240e 100644
--- a/examples/aishell/s0/conf/deepspeech2.yaml
+++ b/examples/aishell/s0/conf/deepspeech2.yaml
@@ -5,16 +5,13 @@ data:
   test_manifest: data/manifest.test
   mean_std_filepath: data/mean_std.json
   vocab_filepath: data/vocab.txt
-  batch_size: 64 # one gpu
   min_input_len: 0.0
   max_input_len: 27.0 # second
   min_output_len: 0.0
   max_output_len: .inf
   min_output_input_ratio: 0.00
   max_output_input_ratio: .inf
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 0
+

 collator:
   augmentation_config: conf/augmentation.json
@@ -32,6 +29,10 @@ collator:
   target_dB: -20
   dither: 1.0
   keep_transcription_text: False
+  sortagrad: True
+  shuffle_method: batch_shuffle
+  num_workers: 0
+  batch_size: 64 # one gpu

 model:
   num_conv_layers: 2

diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml
index 6680e568..434cf264 100644
--- a/examples/tiny/s0/conf/deepspeech2.yaml
+++ b/examples/tiny/s0/conf/deepspeech2.yaml
@@ -6,16 +6,13 @@ data:
   mean_std_filepath: data/mean_std.json
   unit_type: char
   vocab_filepath: data/vocab.txt
-  batch_size: 4
   min_input_len: 0.0
   max_input_len: 27.0
   min_output_len: 0.0
   max_output_len: 400.0
   min_output_input_ratio: 0.05
   max_output_input_ratio: 10.0
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 0
+

 collator:
   augmentation_config: conf/augmentation.json
@@ -33,6 +30,10 @@ collator:
   target_dB: -20
   dither: 1.0
   keep_transcription_text: False
+  sortagrad: True
+  shuffle_method: batch_shuffle
+  num_workers: 0
+  batch_size: 4

 model:
   num_conv_layers: 2

From 557427736e9f2fba6715cc3ce18b3175a3c42cd8 Mon Sep 17 00:00:00 2001
From: Haoxin Ma <745165806@qq.com>
Date: Fri, 18 Jun 2021 06:41:28 +0000
Subject: [PATCH 11/14] move redundant params

---
 deepspeech/exps/deepspeech2/config.py   |  30 +++----
 deepspeech/exps/deepspeech2/model.py    |  14 ++--
 deepspeech/exps/u2/config.py            |  12 +--
 deepspeech/exps/u2/model.py             |  35 ++++----
 deepspeech/io/collator.py               |  36 ++++++--
 deepspeech/io/dataset.py                | 105 +-----------------------
 examples/aishell/s1/conf/conformer.yaml |  14 ++--
 examples/tiny/s0/conf/deepspeech2.yaml  |  10 +--
 examples/tiny/s1/conf/transformer.yaml  |  22 ++---
 9 files changed, 96 insertions(+), 182 deletions(-)

diff --git a/deepspeech/exps/deepspeech2/config.py b/deepspeech/exps/deepspeech2/config.py
index faaff1aa..050a50b0 100644
--- a/deepspeech/exps/deepspeech2/config.py
+++ b/deepspeech/exps/deepspeech2/config.py
@@ -21,32 +21,18 @@ _C.data = CN(
         train_manifest="",
         dev_manifest="",
         test_manifest="",
-        unit_type="char",
-        vocab_filepath="",
-        spm_model_prefix="",
-        mean_std_filepath="",
-        augmentation_config="",
         max_duration=float('inf'),
         min_duration=0.0,
     ))

-_C.model = CN(
-    dict(
-        num_conv_layers=2,  #Number of stacking convolution layers.
-        num_rnn_layers=3,  #Number of stacking RNN layers.
-        rnn_layer_size=1024,  #RNN layer size (number of RNN cells).
-        use_gru=True,  #Use gru if set True. Use simple rnn if set False.
-        share_rnn_weights=True  #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported.
-    ))
-
 _C.collator =CN(
     dict(
-        augmentation_config="",
-        random_seed=0,
-        mean_std_filepath="",
         unit_type="char",
         vocab_filepath="",
         spm_model_prefix="",
+        mean_std_filepath="",
+        augmentation_config="",
+        random_seed=0,
         specgram_type='linear',  # 'linear', 'mfcc', 'fbank'
         feat_dim=0,  # 'mfcc', 'fbank'
         delta_delta=False,  # 'mfcc', 'fbank'
@@ -65,6 +51,16 @@ _C.collator =CN(
         shuffle_method="batch_shuffle",  # 'batch_shuffle', 'instance_shuffle'
     ))

+_C.model = CN(
+    dict(
+        num_conv_layers=2,  #Number of stacking convolution layers.
+        num_rnn_layers=3,  #Number of stacking RNN layers.
+        rnn_layer_size=1024,  #RNN layer size (number of RNN cells).
+        use_gru=True,  #Use gru if set True. Use simple rnn if set False.
+        share_rnn_weights=True  #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported.
+    ))
+
+
 DeepSpeech2Model.params(_C.model)

 _C.training = CN(

diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py
index b54192dd..1eefc871 100644
--- a/deepspeech/exps/deepspeech2/model.py
+++ b/deepspeech/exps/deepspeech2/model.py
@@ -143,7 +143,6 @@ class DeepSpeech2Trainer(Trainer):
         train_dataset = ManifestDataset.from_config(config)

         config.data.manifest = config.data.dev_manifest
-        config.data.augmentation_config = ""
         dev_dataset = ManifestDataset.from_config(config)

         if self.parallel:
@@ -165,18 +164,22 @@ class DeepSpeech2Trainer(Trainer):
             sortagrad=config.collator.sortagrad,
             shuffle_method=config.collator.shuffle_method)

-        collate_fn = SpeechCollator.from_config(config)
+        collate_fn_train = SpeechCollator.from_config(config)
+
+
+        config.collator.augmentation_config = ""
+        collate_fn_dev = SpeechCollator.from_config(config)
         self.train_loader = DataLoader(
             train_dataset,
             batch_sampler=batch_sampler,
-            collate_fn=collate_fn,
+            collate_fn=collate_fn_train,
             num_workers=config.collator.num_workers)
         self.valid_loader = DataLoader(
             dev_dataset,
             batch_size=config.collator.batch_size,
             shuffle=False,
             drop_last=False,
-            collate_fn=collate_fn)
+            collate_fn=collate_fn_dev)
         logger.info("Setup train/valid Dataloader!")

@@ -324,8 +327,6 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
         # return raw text

         config.data.manifest = config.data.test_manifest
-        config.data.keep_transcription_text = True
-        config.data.augmentation_config = ""
         # filter test examples, will cause less examples, but no mismatch with training
         # and can use large batch size , save training time, so filter test egs now.
         # config.data.min_input_len = 0.0  # second
@@ -337,6 +338,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):

         test_dataset = ManifestDataset.from_config(config)

         config.collator.keep_transcription_text = True
+        config.collator.augmentation_config = ""
         # return text ord id
         self.test_loader = DataLoader(
             test_dataset,

diff --git a/deepspeech/exps/u2/config.py b/deepspeech/exps/u2/config.py
index 42725c74..d8735453 100644
--- a/deepspeech/exps/u2/config.py
+++ b/deepspeech/exps/u2/config.py
@@ -17,21 +17,13 @@ from deepspeech.exps.u2.model import U2Tester
 from deepspeech.exps.u2.model import U2Trainer
 from deepspeech.io.dataset import ManifestDataset
 from deepspeech.models.u2 import U2Model
+from deepspeech.io.collator import SpeechCollator

 _C = CfgNode()

 _C.data = ManifestDataset.params()

-_C.collator =CfgNode(
-    dict(
-        augmentation_config="",
-        unit_type="char",
-        keep_transcription_text=False,
-        batch_size=32,  # batch size
-        num_workers=0,  # data loader workers
-        sortagrad=False,  # sorted in first epoch when True
-        shuffle_method="batch_shuffle"  # 'batch_shuffle', 'instance_shuffle'
-    ))
+_C.collator = SpeechCollator.params()

 _C.model = U2Model.params()

diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py
index 164903e6..836afa36 100644
--- a/deepspeech/exps/u2/model.py
+++ b/deepspeech/exps/u2/model.py
@@ -100,7 +100,7 @@ class U2Trainer(Trainer):

             if (batch_index + 1) % train_conf.log_interval == 0:
                 msg += "train time: {:>.3f}s, ".format(iteration_time)
-                msg += "batch size: {}, ".format(self.config.data.batch_size)
+                msg += "batch size: {}, ".format(self.config.collator.batch_size)
                 msg += "accum: {}, ".format(train_conf.accum_grad)
                 msg += ', '.join('{}: {:>.6f}'.format(k, v)
                                  for k, v in losses_np.items())
@@ -211,51 +211,52 @@ class U2Trainer(Trainer):
     def setup_dataloader(self):
         config = self.config.clone()
         config.defrost()
-        config.data.keep_transcription_text = False
+        config.collator.keep_transcription_text = False

         # train/valid dataset, return token ids
         config.data.manifest = config.data.train_manifest
         train_dataset = ManifestDataset.from_config(config)

         config.data.manifest = config.data.dev_manifest
-        config.data.augmentation_config = ""
         dev_dataset = ManifestDataset.from_config(config)

-        collate_fn = SpeechCollator.from_config(config)
+        collate_fn_train = SpeechCollator.from_config(config)
+
+        config.collator.augmentation_config = ""
+        collate_fn_dev = SpeechCollator.from_config(config)
+
         if self.parallel:
             batch_sampler = SortagradDistributedBatchSampler(
                 train_dataset,
-                batch_size=config.data.batch_size,
+                batch_size=config.collator.batch_size,
                 num_replicas=None,
                 rank=None,
                 shuffle=True,
                 drop_last=True,
-                sortagrad=config.data.sortagrad,
-                shuffle_method=config.data.shuffle_method)
+                sortagrad=config.collator.sortagrad,
+                shuffle_method=config.collator.shuffle_method)
         else:
             batch_sampler = SortagradBatchSampler(
                 train_dataset,
                 shuffle=True,
-                batch_size=config.data.batch_size,
+                batch_size=config.collator.batch_size,
                 drop_last=True,
-                sortagrad=config.data.sortagrad,
-                shuffle_method=config.data.shuffle_method)
+                sortagrad=config.collator.sortagrad,
+                shuffle_method=config.collator.shuffle_method)
         self.train_loader = DataLoader(
             train_dataset,
             batch_sampler=batch_sampler,
-            collate_fn=collate_fn,
-            num_workers=config.data.num_workers, )
+            collate_fn=collate_fn_train,
+            num_workers=config.collator.num_workers, )
         self.valid_loader = DataLoader(
             dev_dataset,
-            batch_size=config.data.batch_size,
+            batch_size=config.collator.batch_size,
             shuffle=False,
             drop_last=False,
-            collate_fn=collate_fn)
+            collate_fn=collate_fn_dev)

         # test dataset, return raw text
         config.data.manifest = config.data.test_manifest
-        config.data.keep_transcription_text = True
-        config.data.augmentation_config = ""
         # filter test examples, will cause less examples, but no mismatch with training
         # and can use large batch size , save training time, so filter test egs now.
         # config.data.min_input_len = 0.0  # second
@@ -264,9 +265,11 @@ class U2Trainer(Trainer):
         # config.data.max_output_len = float('inf')  # tokens
         # config.data.min_output_input_ratio = 0.00
         # config.data.max_output_input_ratio = float('inf')
+
         test_dataset = ManifestDataset.from_config(config)
         # return text ord id
         config.collator.keep_transcription_text = True
+        config.collator.augmentation_config = ""
         self.test_loader = DataLoader(
             test_dataset,
             batch_size=config.decoding.batch_size,

diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py
index ac817a19..ab1e9165 100644
--- a/deepspeech/io/collator.py
+++ b/deepspeech/io/collator.py
@@ -75,8 +75,8 @@ class SpeechCollator():
         """
         assert 'augmentation_config' in config.collator
         assert 'keep_transcription_text' in config.collator
-        assert 'mean_std_filepath' in config.data
-        assert 'vocab_filepath' in config.data
+        assert 'mean_std_filepath' in config.collator
+        assert 'vocab_filepath' in config.collator
         assert 'specgram_type' in config.collator
         assert 'n_fft' in config.collator
         assert config.collator
@@ -94,9 +94,9 @@ class SpeechCollator():
         speech_collator = cls(
             aug_file=aug_file,
             random_seed=0,
-            mean_std_filepath=config.data.mean_std_filepath,
+            mean_std_filepath=config.collator.mean_std_filepath,
             unit_type=config.collator.unit_type,
-            vocab_filepath=config.data.vocab_filepath,
+            vocab_filepath=config.collator.vocab_filepath,
             spm_model_prefix=config.collator.spm_model_prefix,
             specgram_type=config.collator.specgram_type,
             feat_dim=config.collator.feat_dim,
             delta_delta=config.collator.delta_delta,
             stride_ms=config.collator.stride_ms,
             window_ms=config.collator.window_ms,
             n_fft=config.collator.n_fft,
             max_freq=config.collator.max_freq,
             target_sample_rate=config.collator.target_sample_rate,
             use_dB_normalization=config.collator.use_dB_normalization,
             target_dB=config.collator.target_dB,
             dither=config.collator.dither,
             keep_transcription_text=config.collator.keep_transcription_text
         )
@@ -129,11 +129,31 @@ class SpeechCollator():
                  target_dB=-20,
                  dither=1.0,
                  keep_transcription_text=True):
-        """
-        Padding audio features with zeros to make them have the same shape (or
-        a user-defined shape) within one bach.
-
-        if ``keep_transcription_text`` is False, text is token ids else is raw string.
+        """SpeechCollator Collator
+
+        Args:
+            unit_type(str): token unit type, e.g. char, word, spm
+            vocab_filepath (str): vocab file path.
+            mean_std_filepath (str): mean and std file path, which suffix is *.npy
+            spm_model_prefix (str): spm model prefix, need if `unit_type` is spm.
+            augmentation_config (str, optional): augmentation json str. Defaults to '{}'.
+            stride_ms (float, optional): stride size in ms. Defaults to 10.0.
+            window_ms (float, optional): window size in ms. Defaults to 20.0.
+            n_fft (int, optional): fft points for rfft. Defaults to None.
+            max_freq (int, optional): max cut freq. Defaults to None.
+            target_sample_rate (int, optional): target sample rate which used for training. Defaults to 16000.
+            specgram_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'.
+            feat_dim (int, optional): audio feature dim, using by 'mfcc' or 'fbank'. Defaults to None.
+            delta_delta (bool, optional): audio feature with delta-delta, using by 'fbank' or 'mfcc'. Defaults to False.
+            use_dB_normalization (bool, optional): do dB normalization. Defaults to True.
+            target_dB (int, optional): target dB. Defaults to -20.
+            random_seed (int, optional): for random generator. Defaults to 0.
+            keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False.
+            if ``keep_transcription_text`` is False, text is token ids else is raw string.
+
+        Do augmentations
+        Padding audio features with zeros to make them have the same shape (or
+        a user-defined shape) within one batch.
""" self._keep_transcription_text = keep_transcription_text diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py index 24d8486a..70383b4d 100644 --- a/deepspeech/io/dataset.py +++ b/deepspeech/io/dataset.py @@ -40,15 +40,7 @@ class ManifestDataset(Dataset): def params(cls, config: Optional[CfgNode]=None) -> CfgNode: default = CfgNode( dict( - train_manifest="", - dev_manifest="", - test_manifest="", manifest="", - unit_type="char", - vocab_filepath="", - spm_model_prefix="", - mean_std_filepath="", - augmentation_config="", max_input_len=27.0, min_input_len=0.0, max_output_len=float('inf'), @@ -73,25 +65,10 @@ class ManifestDataset(Dataset): """ assert 'manifest' in config.data assert config.data.manifest - assert 'keep_transcription_text' in config.collator - - if isinstance(config.data.augmentation_config, (str, bytes)): - if config.data.augmentation_config: - aug_file = io.open( - config.data.augmentation_config, mode='r', encoding='utf8') - else: - aug_file = io.StringIO(initial_value='{}', newline='') - else: - aug_file = config.data.augmentation_config - assert isinstance(aug_file, io.StringIO) + dataset = cls( manifest_path=config.data.manifest, - unit_type=config.data.unit_type, - vocab_filepath=config.data.vocab_filepath, - mean_std_filepath=config.data.mean_std_filepath, - spm_model_prefix=config.data.spm_model_prefix, - augmentation_config=aug_file.read(), max_input_len=config.data.max_input_len, min_input_len=config.data.min_input_len, max_output_len=config.data.max_output_len, @@ -101,23 +78,8 @@ class ManifestDataset(Dataset): ) return dataset - - def _read_vocab(self, vocab_filepath): - """Load vocabulary from file.""" - vocab_lines = [] - with open(vocab_filepath, 'r', encoding='utf-8') as file: - vocab_lines.extend(file.readlines()) - vocab_list = [line[:-1] for line in vocab_lines] - return vocab_list - - def __init__(self, manifest_path, - unit_type, - vocab_filepath, - mean_std_filepath, - spm_model_prefix=None, - augmentation_config='{}', max_input_len=float('inf'), min_input_len=0.0, max_output_len=float('inf'), @@ -128,34 +90,16 @@ class ManifestDataset(Dataset): Args: manifest_path (str): manifest josn file path - unit_type(str): token unit type, e.g. char, word, spm - vocab_filepath (str): vocab file path. - mean_std_filepath (str): mean and std file path, which suffix is *.npy - spm_model_prefix (str): spm model prefix, need if `unit_type` is spm. - augmentation_config (str, optional): augmentation json str. Defaults to '{}'. max_input_len ([type], optional): maximum output seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to float('inf'). min_input_len (float, optional): minimum input seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to 0.0. max_output_len (float, optional): maximum input seq length, in modeling units. Defaults to 500.0. min_output_len (float, optional): minimum input seq length, in modeling units. Defaults to 0.0. max_output_input_ratio (float, optional): maximum output seq length/output seq length ratio. Defaults to 10.0. min_output_input_ratio (float, optional): minimum output seq length/output seq length ratio. Defaults to 0.05. - stride_ms (float, optional): stride size in ms. Defaults to 10.0. - window_ms (float, optional): window size in ms. Defaults to 20.0. - n_fft (int, optional): fft points for rfft. Defaults to None. - max_freq (int, optional): max cut freq. Defaults to None. - target_sample_rate (int, optional): target sample rate which used for training. 
Defaults to 16000. - specgram_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'. - feat_dim (int, optional): audio feature dim, using by 'mfcc' or 'fbank'. Defaults to None. - delta_delta (bool, optional): audio feature with delta-delta, using by 'fbank' or 'mfcc'. Defaults to False. - use_dB_normalization (bool, optional): do dB normalization. Defaults to True. - target_dB (int, optional): target dB. Defaults to -20. - random_seed (int, optional): for random generator. Defaults to 0. - keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False. + """ super().__init__() - # self._rng = np.random.RandomState(random_seed) - # read manifest self._manifest = read_manifest( manifest_path=manifest_path, @@ -167,51 +111,6 @@ class ManifestDataset(Dataset): min_output_input_ratio=min_output_input_ratio) self._manifest.sort(key=lambda x: x["feat_shape"][0]) - # self._vocab_list = self._read_vocab(vocab_filepath) - - - # @property - # def manifest(self): - # return self._manifest - - # @property - # def vocab_size(self): - # """Return the vocabulary size. - - # Returns: - # int: Vocabulary size. - # """ - # return len(self._vocab_list) - - # @property - # def vocab_list(self): - # """Return the vocabulary in list. - - # Returns: - # List[str]: - # """ - # return self._vocab_list - - # @property - # def vocab_dict(self): - # """Return the vocabulary in dict. - - # Returns: - # Dict[str, int]: - # """ - # vocab_dict = dict( - # [(token, idx) for (idx, token) in enumerate(self._vocab_list)]) - # return vocab_dict - - # @property - # def feature_size(self): - # """Return the audio feature size. - - # Returns: - # int: audio feature size. - # """ - # return self._manifest[0]["feat_shape"][-1] - def __len__(self): return len(self._manifest) diff --git a/examples/aishell/s1/conf/conformer.yaml b/examples/aishell/s1/conf/conformer.yaml index b880f858..116c9192 100644 --- a/examples/aishell/s1/conf/conformer.yaml +++ b/examples/aishell/s1/conf/conformer.yaml @@ -3,17 +3,20 @@ data: train_manifest: data/manifest.train dev_manifest: data/manifest.dev test_manifest: data/manifest.test - vocab_filepath: data/vocab.txt - unit_type: 'char' - spm_model_prefix: '' - augmentation_config: conf/augmentation.json - batch_size: 64 min_input_len: 0.5 max_input_len: 20.0 # second min_output_len: 0.0 max_output_len: 400.0 min_output_input_ratio: 0.05 max_output_input_ratio: 10.0 + + +collator: + vocab_filepath: data/vocab.txt + unit_type: 'char' + spm_model_prefix: '' + augmentation_config: conf/augmentation.json + batch_size: 64 raw_wav: True # use raw_wav or kaldi feature specgram_type: fbank #linear, mfcc, fbank feat_dim: 80 @@ -32,7 +35,6 @@ data: shuffle_method: batch_shuffle num_workers: 2 - # network architecture model: cmvn_file: "data/mean_std.json" diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml index 434cf264..6737d1b7 100644 --- a/examples/tiny/s0/conf/deepspeech2.yaml +++ b/examples/tiny/s0/conf/deepspeech2.yaml @@ -2,10 +2,7 @@ data: train_manifest: data/manifest.tiny dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/vocab.txt + test_manifest: data/manifest.tiny min_input_len: 0.0 max_input_len: 27.0 min_output_len: 0.0 @@ -15,6 +12,9 @@ data: collator: + mean_std_filepath: data/mean_std.json + unit_type: char + vocab_filepath: data/vocab.txt augmentation_config: conf/augmentation.json 
   random_seed: 0
   spm_model_prefix:
@@ -43,7 +43,7 @@ model:
   share_rnn_weights: True

 training:
-  n_epoch: 23
+  n_epoch: 24
   lr: 1e-5
   lr_decay: 1.0
   weight_decay: 1e-06

diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/s1/conf/transformer.yaml
index 5e28e4e8..250995fa 100644
--- a/examples/tiny/s1/conf/transformer.yaml
+++ b/examples/tiny/s1/conf/transformer.yaml
@@ -3,26 +3,20 @@ data:
   train_manifest: data/manifest.tiny
   dev_manifest: data/manifest.tiny
   test_manifest: data/manifest.tiny
-  vocab_filepath: data/vocab.txt
-  unit_type: 'spm'
-  spm_model_prefix: 'data/bpe_unigram_200'
-  mean_std_filepath: ""
-  batch_size: 4
   min_input_len: 0.5  # second
   max_input_len: 20.0 # second
   min_output_len: 0.0 # tokens
   max_output_len: 400.0 # tokens
   min_output_input_ratio: 0.05
   max_output_input_ratio: 10.0
-  raw_wav: True  # use raw_wav or kaldi feature
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 0 #2
-
+

 collator:
+  vocab_filepath: data/vocab.txt
+  mean_std_filepath: ""
   augmentation_config: conf/augmentation.json
   random_seed: 0
-  spm_model_prefix:
+  unit_type: 'spm'
+  spm_model_prefix: 'data/bpe_unigram_200'
   specgram_type: fbank
   feat_dim: 80
   delta_delta: False
@@ -35,6 +29,12 @@ collator:
   target_dB: -20
   dither: 1.0
   keep_transcription_text: False
+  batch_size: 4
+  sortagrad: True
+  shuffle_method: batch_shuffle
+  num_workers: 0 #2
+  raw_wav: True  # use raw_wav or kaldi feature
+

 # network architecture
 model:

From 089a8ed602721acf43c676b37249987ebd8bfa3b Mon Sep 17 00:00:00 2001
From: Haoxin Ma <745165806@qq.com>
Date: Fri, 18 Jun 2021 09:47:53 +0000
Subject: [PATCH 12/14] fix deepspeech2/model.py and deepspeech2/config.py

---
 deepspeech/exps/deepspeech2/config.py | 76 ++++-----------------
 deepspeech/exps/deepspeech2/model.py  | 39 ++++++++++++
 2 files changed, 50 insertions(+), 65 deletions(-)

diff --git a/deepspeech/exps/deepspeech2/config.py b/deepspeech/exps/deepspeech2/config.py
index 050a50b0..7d2250fc 100644
--- a/deepspeech/exps/deepspeech2/config.py
+++ b/deepspeech/exps/deepspeech2/config.py
@@ -11,80 +11,26 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from yacs.config import CfgNode as CN
+from yacs.config import CfgNode

 from deepspeech.models.deepspeech2 import DeepSpeech2Model
+from deepspeech.io.dataset import ManifestDataset
+from deepspeech.io.collator import SpeechCollator
+from deepspeech.exps.deepspeech2.model import DeepSpeech2Trainer
+from deepspeech.exps.deepspeech2.model import DeepSpeech2Tester

-_C = CN()
+_C = CfgNode()

-_C.data = CN(
-    dict(
-        train_manifest="",
-        dev_manifest="",
-        test_manifest="",
-        max_duration=float('inf'),
-        min_duration=0.0,
-    ))
+_C.data = ManifestDataset.params()

-_C.collator =CN(
-    dict(
-        unit_type="char",
-        vocab_filepath="",
-        spm_model_prefix="",
-        mean_std_filepath="",
-        augmentation_config="",
-        random_seed=0,
-        specgram_type='linear',  # 'linear', 'mfcc', 'fbank'
-        feat_dim=0,  # 'mfcc', 'fbank'
-        delta_delta=False,  # 'mfcc', 'fbank'
-        stride_ms=10.0,  # ms
-        window_ms=20.0,  # ms
-        n_fft=None,  # fft points
-        max_freq=None,  # None for samplerate/2
-        target_sample_rate=16000,  # target sample rate
-        use_dB_normalization=True,
-        target_dB=-20,
-        dither=1.0,  # feature dither
-        keep_transcription_text=False,
-        batch_size=32,  # batch size
-        num_workers=0,  # data loader workers
-        sortagrad=False,  # sorted in first epoch when True
-        shuffle_method="batch_shuffle",  # 'batch_shuffle', 'instance_shuffle'
-    ))
+_C.collator = SpeechCollator.params()

-_C.model = CN(
-    dict(
-        num_conv_layers=2,  #Number of stacking convolution layers.
-        num_rnn_layers=3,  #Number of stacking RNN layers.
-        rnn_layer_size=1024,  #RNN layer size (number of RNN cells).
-        use_gru=True,  #Use gru if set True. Use simple rnn if set False.
-        share_rnn_weights=True  #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported.
-    ))
+_C.model = DeepSpeech2Model.params()

-DeepSpeech2Model.params(_C.model)
-
-_C.training = CN(
-    dict(
-        lr=5e-4,  # learning rate
-        lr_decay=1.0,  # learning rate decay
-        weight_decay=1e-6,  # the coeff of weight decay
-        global_grad_clip=5.0,  # the global norm clip
-        n_epoch=50,  # train epochs
-    ))
+_C.training = DeepSpeech2Trainer.params()

-_C.decoding = CN(
-    dict(
-        alpha=2.5,  # Coef of LM for beam search.
-        beta=0.3,  # Coef of WC for beam search.
-        cutoff_prob=1.0,  # Cutoff probability for pruning.
-        cutoff_top_n=40,  # Cutoff number for pruning.
-        lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm',  # Filepath for language model.
-        decoding_method='ctc_beam_search',  # Decoding method. Options: ctc_beam_search, ctc_greedy
-        error_rate_type='wer',  # Error rate type for evaluation. Options `wer`, 'cer'
-        num_proc_bsearch=8,  # # of CPUs for beam search.
-        beam_size=500,  # Beam search width.
-        batch_size=128,  # decoding batch size
-    ))
+_C.decoding = DeepSpeech2Tester.params()


 def get_cfg_defaults():

diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py
index 1eefc871..c11d1e25 100644
--- a/deepspeech/exps/deepspeech2/model.py
+++ b/deepspeech/exps/deepspeech2/model.py
@@ -34,10 +34,28 @@ from deepspeech.utils import layer_tools
 from deepspeech.utils import mp_tools
 from deepspeech.utils.log import Log

+from typing import Optional
+from yacs.config import CfgNode
 logger = Log(__name__).getlog()


 class DeepSpeech2Trainer(Trainer):
+    @classmethod
+    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
+        # training config
+        default = CfgNode(
+                dict(
+                    lr=5e-4,  # learning rate
+                    lr_decay=1.0,  # learning rate decay
+                    weight_decay=1e-6,  # the coeff of weight decay
+                    global_grad_clip=5.0,  # the global norm clip
+                    n_epoch=50,  # train epochs
+                ))
+
+        if config is not None:
+            config.merge_from_other_cfg(default)
+        return default
+
     def __init__(self, config, args):
         super().__init__(config, args)

@@ -184,6 +202,27 @@ class DeepSpeech2Trainer(Trainer):


 class DeepSpeech2Tester(DeepSpeech2Trainer):
+    @classmethod
+    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
+        # testing config
+        default = CfgNode(
+                dict(
+                    alpha=2.5,  # Coef of LM for beam search.
+                    beta=0.3,  # Coef of WC for beam search.
+                    cutoff_prob=1.0,  # Cutoff probability for pruning.
+                    cutoff_top_n=40,  # Cutoff number for pruning.
+                    lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm',  # Filepath for language model.
+                    decoding_method='ctc_beam_search',  # Decoding method. Options: ctc_beam_search, ctc_greedy
+                    error_rate_type='wer',  # Error rate type for evaluation. Options `wer`, 'cer'
+                    num_proc_bsearch=8,  # # of CPUs for beam search.
+                    beam_size=500,  # Beam search width.
+                    batch_size=128,  # decoding batch size
+                ))
+
+        if config is not None:
+            config.merge_from_other_cfg(default)
+        return default
+
     def __init__(self, config, args):
         super().__init__(config, args)

From 3652b87f33877d4b64b75398f9f99c34b1e5b02e Mon Sep 17 00:00:00 2001
From: Haoxin Ma <745165806@qq.com>
Date: Fri, 18 Jun 2021 10:09:35 +0000
Subject: [PATCH 13/14] fix pre-commit

---
 deepspeech/exps/deepspeech2/config.py |   9 +--
 deepspeech/exps/deepspeech2/model.py  |  58 +++++++-------
 deepspeech/exps/u2/config.py          |   2 +-
 deepspeech/exps/u2/model.py           |  19 +++--
 deepspeech/io/collator.py             | 108 +++++++++++++-------------
 deepspeech/io/dataset.py              |  16 +---
 deepspeech/models/u2.py               |   1 -
 7 files changed, 108 insertions(+), 105 deletions(-)

diff --git a/deepspeech/exps/deepspeech2/config.py b/deepspeech/exps/deepspeech2/config.py
index 7d2250fc..2f0f5c24 100644
--- a/deepspeech/exps/deepspeech2/config.py
+++ b/deepspeech/exps/deepspeech2/config.py
@@ -13,12 +13,11 @@
 # limitations under the License.
 from yacs.config import CfgNode

-from deepspeech.models.deepspeech2 import DeepSpeech2Model
-from deepspeech.io.dataset import ManifestDataset
-from deepspeech.io.collator import SpeechCollator
-from deepspeech.exps.deepspeech2.model import DeepSpeech2Trainer
 from deepspeech.exps.deepspeech2.model import DeepSpeech2Tester
-
+from deepspeech.exps.deepspeech2.model import DeepSpeech2Trainer
+from deepspeech.io.collator import SpeechCollator
+from deepspeech.io.dataset import ManifestDataset
+from deepspeech.models.deepspeech2 import DeepSpeech2Model

 _C = CfgNode()

diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py
index c11d1e25..deb8752b 100644
--- a/deepspeech/exps/deepspeech2/model.py
+++ b/deepspeech/exps/deepspeech2/model.py
@@ -15,11 +15,13 @@
 import time
 from collections import defaultdict
 from pathlib import Path
+from typing import Optional

 import numpy as np
 import paddle
 from paddle import distributed as dist
 from paddle.io import DataLoader
+from yacs.config import CfgNode

 from deepspeech.io.collator import SpeechCollator
 from deepspeech.io.dataset import ManifestDataset
@@ -33,9 +35,6 @@ from deepspeech.utils import error_rate
 from deepspeech.utils import layer_tools
 from deepspeech.utils import mp_tools
 from deepspeech.utils.log import Log
-
-from typing import Optional
-from yacs.config import CfgNode
 logger = Log(__name__).getlog()


@@ -44,13 +43,13 @@ class DeepSpeech2Trainer(Trainer):
     def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
         # training config
         default = CfgNode(
-                dict(
-                    lr=5e-4,  # learning rate
-                    lr_decay=1.0,  # learning rate decay
-                    weight_decay=1e-6,  # the coeff of weight decay
-                    global_grad_clip=5.0,  # the global norm clip
-                    n_epoch=50,  # train epochs
-                ))
+            dict(
+                lr=5e-4,  # learning rate
+                lr_decay=1.0,  # learning rate decay
+                weight_decay=1e-6,  # the coeff of weight decay
+                global_grad_clip=5.0,  # the global norm clip
+                n_epoch=50,  # train epochs
+            ))

         if config is not None:
             config.merge_from_other_cfg(default)
@@ -184,7 +183,6 @@ class DeepSpeech2Trainer(Trainer):

         collate_fn_train = SpeechCollator.from_config(config)

-
         config.collator.augmentation_config = ""
         collate_fn_dev = SpeechCollator.from_config(config)

         self.train_loader = DataLoader(
@@ -206,18 +204,18 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
     def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
         # testing config
         default = CfgNode(
-                dict(
-                    alpha=2.5,  # Coef of LM for beam search.
-                    beta=0.3,  # Coef of WC for beam search.
-                    cutoff_prob=1.0,  # Cutoff probability for pruning.
-                    cutoff_top_n=40,  # Cutoff number for pruning.
-                    lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm',  # Filepath for language model.
-                    decoding_method='ctc_beam_search',  # Decoding method. Options: ctc_beam_search, ctc_greedy
-                    error_rate_type='wer',  # Error rate type for evaluation. Options `wer`, 'cer'
-                    num_proc_bsearch=8,  # # of CPUs for beam search.
-                    beam_size=500,  # Beam search width.
-                    batch_size=128,  # decoding batch size
-                ))
+            dict(
+                alpha=2.5,  # Coef of LM for beam search.
+                beta=0.3,  # Coef of WC for beam search.
+                cutoff_prob=1.0,  # Cutoff probability for pruning.
+                cutoff_top_n=40,  # Cutoff number for pruning.
+                lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm',  # Filepath for language model.
+                decoding_method='ctc_beam_search',  # Decoding method. Options: ctc_beam_search, ctc_greedy
+                error_rate_type='wer',  # Error rate type for evaluation. Options `wer`, 'cer'
+                num_proc_bsearch=8,  # # of CPUs for beam search.
+                beam_size=500,  # Beam search width.
+                batch_size=128,  # decoding batch size
+            ))

         if config is not None:
             config.merge_from_other_cfg(default)
         return default
@@ -235,7 +233,13 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
             trans.append(''.join([chr(i) for i in ids]))
         return trans

-    def compute_metrics(self, utts, audio, audio_len, texts, texts_len, fout = None):
+    def compute_metrics(self,
+                        utts,
+                        audio,
+                        audio_len,
+                        texts,
+                        texts_len,
+                        fout=None):
         cfg = self.config.decoding
         errors_sum, len_refs, num_ins = 0.0, 0, 0
         errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors
@@ -257,7 +261,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
             cutoff_top_n=cfg.cutoff_top_n,
             num_processes=cfg.num_proc_bsearch)

-        for utt, target, result in zip(utts, target_transcripts, result_transcripts):
+        for utt, target, result in zip(utts, target_transcripts,
+                                       result_transcripts):
             errors, len_ref = errors_func(target, result)
             errors_sum += errors
             len_refs += len_ref
@@ -287,7 +292,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
         with open(self.args.result_file, 'w') as fout:
             for i, batch in enumerate(self.test_loader):
                 utts, audio, audio_len, texts, texts_len = batch
-                metrics = self.compute_metrics(utts, audio, audio_len, texts, texts_len, fout)
+                metrics = self.compute_metrics(utts, audio, audio_len, texts,
+                                               texts_len, fout)
                 errors_sum += metrics['errors_sum']
                 len_refs += metrics['len_refs']
                 num_ins += metrics['num_ins']

diff --git a/deepspeech/exps/u2/config.py b/deepspeech/exps/u2/config.py
index d8735453..4ec7bd19 100644
--- a/deepspeech/exps/u2/config.py
+++ b/deepspeech/exps/u2/config.py
@@ -15,9 +15,9 @@ from yacs.config import CfgNode

 from deepspeech.exps.u2.model import U2Tester
 from deepspeech.exps.u2.model import U2Trainer
+from deepspeech.io.collator import SpeechCollator
 from deepspeech.io.dataset import ManifestDataset
 from deepspeech.models.u2 import U2Model
-from deepspeech.io.collator import SpeechCollator

 _C = CfgNode()

diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py
index 836afa36..05551875 100644
--- a/deepspeech/exps/u2/model.py
+++ b/deepspeech/exps/u2/model.py
@@ -78,7 +78,8 @@ class U2Trainer(Trainer):
         start = time.time()

         utt, audio, audio_len, text, text_len = batch_data
-        loss, attention_loss, ctc_loss = self.model(audio, audio_len, text, text_len)
+        loss, attention_loss, ctc_loss = self.model(audio, audio_len, text,
+                                                    text_len)
         # loss div by `batch_size * accum_grad`
         loss /= train_conf.accum_grad
         loss.backward()
@@ -121,7 +122,8 @@ class U2Trainer(Trainer):
             total_loss = 0.0
             for i, batch in enumerate(self.valid_loader):
                 utt, audio, audio_len, text, text_len = batch
-                loss, attention_loss, ctc_loss = self.model(audio, audio_len, text, text_len)
+                loss, attention_loss, ctc_loss = self.model(audio, audio_len,
+                                                            text, text_len)
                 if paddle.isfinite(loss):
                     num_utts = batch[1].shape[0]
                     num_seen_utts += num_utts
@@ -211,7 +213,7 @@ class U2Trainer(Trainer):
         dev_dataset = ManifestDataset.from_config(config)

         collate_fn_train = SpeechCollator.from_config(config)
-
+
         config.collator.augmentation_config = ""
         collate_fn_dev = SpeechCollator.from_config(config)

@@ -372,7 +374,13 @@ class U2Tester(U2Trainer):
             trans.append(''.join([chr(i) for i in ids]))
         return trans

-    def compute_metrics(self, utts, audio, audio_len, texts, texts_len, fout=None):
+    def compute_metrics(self,
+                        utts,
+                        audio,
+                        audio_len,
+                        texts,
+                        texts_len,
+                        fout=None):
         cfg = self.config.decoding
         errors_sum, len_refs, num_ins = 0.0, 0, 0
         errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors
@@ -399,7 +407,8 @@ class U2Tester(U2Trainer):
             simulate_streaming=cfg.simulate_streaming)
         decode_time = time.time() - start_time

-        for utt, target, result in zip(utts, target_transcripts, result_transcripts):
+        for utt, target, result in zip(utts, target_transcripts,
+                                       result_transcripts):
             errors, len_ref = errors_func(target, result)
             errors_sum += errors
             len_refs += len_ref

diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py
index ab1e9165..ecf7024c 100644
--- a/deepspeech/io/collator.py
+++ b/deepspeech/io/collator.py
@@ -11,21 +11,21 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import io
+import time
+from collections import namedtuple
+from typing import Optional
+
 import numpy as np
+from yacs.config import CfgNode

-from deepspeech.frontend.utility import IGNORE_ID
-from deepspeech.io.utility import pad_sequence
-from deepspeech.utils.log import Log
 from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
 from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer
 from deepspeech.frontend.normalizer import FeatureNormalizer
 from deepspeech.frontend.speech import SpeechSegment
-import io
-import time
-from yacs.config import CfgNode
-from typing import Optional
-
-from collections import namedtuple
+from deepspeech.frontend.utility import IGNORE_ID
+from deepspeech.io.utility import pad_sequence
+from deepspeech.utils.log import Log

 __all__ = ["SpeechCollator"]

@@ -34,6 +34,7 @@ logger = Log(__name__).getlog()
 # namedtupe need global for pickle.
 TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object'])

+
 class SpeechCollator():
     @classmethod
     def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
@@ -56,8 +57,7 @@ class SpeechCollator():
                 use_dB_normalization=True,
                 target_dB=-20,
                 dither=1.0,  # feature dither
-                keep_transcription_text=False
-            ))
+                keep_transcription_text=False))

         if config is not None:
             config.merge_from_other_cfg(default)
@@ -84,7 +84,9 @@ class SpeechCollator():
         if isinstance(config.collator.augmentation_config, (str, bytes)):
             if config.collator.augmentation_config:
                 aug_file = io.open(
-                    config.collator.augmentation_config, mode='r', encoding='utf8')
+                    config.collator.augmentation_config,
+                    mode='r',
+                    encoding='utf8')
             else:
                 aug_file = io.StringIO(initial_value='{}', newline='')
         else:
@@ -92,43 +94,46 @@ class SpeechCollator():
         assert isinstance(aug_file, io.StringIO)

         speech_collator = cls(
-            aug_file=aug_file,
-            random_seed=0,
-            mean_std_filepath=config.collator.mean_std_filepath,
-            unit_type=config.collator.unit_type,
-            vocab_filepath=config.collator.vocab_filepath,
-            spm_model_prefix=config.collator.spm_model_prefix,
-            specgram_type=config.collator.specgram_type,
-            feat_dim=config.collator.feat_dim,
-            delta_delta=config.collator.delta_delta,
-            stride_ms=config.collator.stride_ms,
-            window_ms=config.collator.window_ms,
-            n_fft=config.collator.n_fft,
-            max_freq=config.collator.max_freq,
-            target_sample_rate=config.collator.target_sample_rate,
-            use_dB_normalization=config.collator.use_dB_normalization,
-            target_dB=config.collator.target_dB,
-            dither=config.collator.dither,
-            keep_transcription_text=config.collator.keep_transcription_text
-        )
+            aug_file=aug_file,
+            random_seed=0,
+            mean_std_filepath=config.collator.mean_std_filepath,
+            unit_type=config.collator.unit_type,
+            vocab_filepath=config.collator.vocab_filepath,
+            spm_model_prefix=config.collator.spm_model_prefix,
+            specgram_type=config.collator.specgram_type,
+            feat_dim=config.collator.feat_dim,
+            delta_delta=config.collator.delta_delta,
+            stride_ms=config.collator.stride_ms,
+            window_ms=config.collator.window_ms,
+            n_fft=config.collator.n_fft,
+            max_freq=config.collator.max_freq,
+            target_sample_rate=config.collator.target_sample_rate,
+            use_dB_normalization=config.collator.use_dB_normalization,
+            target_dB=config.collator.target_dB,
+            dither=config.collator.dither,
+            keep_transcription_text=config.collator.keep_transcription_text)
         return speech_collator

-    def __init__(self, aug_file, mean_std_filepath,
-                 vocab_filepath, spm_model_prefix,
-                 random_seed=0,
-                 unit_type="char",
-                 specgram_type='linear',  # 'linear', 'mfcc', 'fbank'
-                 feat_dim=0,  # 'mfcc', 'fbank'
-                 delta_delta=False,  # 'mfcc', 'fbank'
-                 stride_ms=10.0,  # ms
-                 window_ms=20.0,  # ms
-                 n_fft=None,  # fft points
-                 max_freq=None,  # None for samplerate/2
-                 target_sample_rate=16000,  # target sample rate
-                 use_dB_normalization=True,
-                 target_dB=-20,
-                 dither=1.0,
-                 keep_transcription_text=True):
+    def __init__(
+            self,
+            aug_file,
+            mean_std_filepath,
+            vocab_filepath,
+            spm_model_prefix,
+            random_seed=0,
+            unit_type="char",
+            specgram_type='linear',  # 'linear', 'mfcc', 'fbank'
+            feat_dim=0,  # 'mfcc', 'fbank'
+            delta_delta=False,  # 'mfcc', 'fbank'
+            stride_ms=10.0,  # ms
+            window_ms=20.0,  # ms
+            n_fft=None,  # fft points
+            max_freq=None,  # None for samplerate/2
+            target_sample_rate=16000,  # target sample rate
+            use_dB_normalization=True,
+            target_dB=-20,
+            dither=1.0,
+            keep_transcription_text=True):
         """SpeechCollator Collator

         Args:
@@ -159,9 +164,8 @@ class SpeechCollator():

         self._local_data = TarLocalData(tar2info={}, tar2object={})
         self._augmentation_pipeline = AugmentationPipeline(
-            augmentation_config=aug_file.read(),
-            random_seed=random_seed)
-
+            augmentation_config=aug_file.read(), random_seed=random_seed)
+
         self._normalizer = FeatureNormalizer(
             mean_std_filepath) if mean_std_filepath else None

@@ -290,8 +294,6 @@ class SpeechCollator():
         text_lens = np.array(text_lens).astype(np.int64)
         return utts, padded_audios, audio_lens, padded_texts, text_lens

-
-
     @property
     def manifest(self):
         return self._manifest
@@ -318,4 +320,4 @@ class SpeechCollator():

     @property
     def stride_ms(self):
-        return self._speech_featurizer.stride_ms
\ No newline at end of file
+        return self._speech_featurizer.stride_ms

diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py
index 70383b4d..92c60f35 100644
--- a/deepspeech/io/dataset.py
+++ b/deepspeech/io/dataset.py
@@ -12,19 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import io
-import tarfile
-import time
-from collections import namedtuple
 from typing import Optional

-import numpy as np
 from paddle.io import Dataset
 from yacs.config import CfgNode

-from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
-from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer
-from deepspeech.frontend.normalizer import FeatureNormalizer
-from deepspeech.frontend.speech import SpeechSegment
 from deepspeech.frontend.utility import read_manifest
 from deepspeech.utils.log import Log
@@ -46,8 +38,7 @@ class ManifestDataset(Dataset):
                 max_output_len=float('inf'),
                 min_output_len=0.0,
                 max_output_input_ratio=float('inf'),
-                min_output_input_ratio=0.0,
-            ))
+                min_output_input_ratio=0.0, ))

         if config is not None:
             config.merge_from_other_cfg(default)
@@ -66,7 +57,6 @@ class ManifestDataset(Dataset):
         assert 'manifest' in config.data
         assert config.data.manifest
-
         dataset = cls(
             manifest_path=config.data.manifest,
@@ -74,8 +64,7 @@ class ManifestDataset(Dataset):
             max_output_len=config.data.max_output_len,
             min_output_len=config.data.min_output_len,
             max_output_input_ratio=config.data.max_output_input_ratio,
-            min_output_input_ratio=config.data.min_output_input_ratio,
-        )
+            min_output_input_ratio=config.data.min_output_input_ratio, )
         return dataset

     def __init__(self,
@@ -111,7 +100,6 @@ class ManifestDataset(Dataset):
             min_output_input_ratio=min_output_input_ratio)
         self._manifest.sort(key=lambda x: x["feat_shape"][0])
-
     def __len__(self):
         return len(self._manifest)

diff --git a/deepspeech/models/u2.py b/deepspeech/models/u2.py
index bcfddaef..238e2d35 100644
--- a/deepspeech/models/u2.py
+++ b/deepspeech/models/u2.py
@@ -905,7 +905,6 @@ class U2InferModel(U2Model):
     def __init__(self, configs: dict):
         super().__init__(configs)
-
     def forward(self,
                 feats,
                 feats_lengths,

From 3652b87f33877d4b64b75398f9f99c34b1e5b02e Mon Sep 17 00:00:00 2001
From: Haoxin Ma <745165806@qq.com>
Date: Fri, 18 Jun 2021 10:11:17 +0000
Subject: [PATCH 14/14] fix unused imports

---
 deepspeech/io/collator.py | 1 -
 deepspeech/io/dataset.py  | 1 -
 2 files changed, 2 deletions(-)

diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py
index ecf7024c..1061f97c 100644
--- a/deepspeech/io/collator.py
+++ b/deepspeech/io/collator.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import io
-import time
 from collections import namedtuple
 from typing import Optional

diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py
index 92c60f35..3fc4e988 100644
--- a/deepspeech/io/dataset.py
+++ b/deepspeech/io/dataset.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import io
 from typing import Optional

 from paddle.io import Dataset
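
PATCH 11 through PATCH 13 converge on one convention: each pipeline component
(ManifestDataset, SpeechCollator, DeepSpeech2Trainer, DeepSpeech2Tester) owns
its default hyperparameters in a `params()` classmethod and is built through a
`from_config()` classmethod that reads only its own config section, so the
experiment config is just the components' nodes hung off a single root
CfgNode. The snippet below is a minimal, self-contained sketch of that
convention using yacs; `ToyCollator` and its keys are hypothetical stand-ins
for illustration, not code from the repository.

    from typing import Optional

    from yacs.config import CfgNode


    class ToyCollator():
        """Hypothetical component following the params()/from_config() convention."""

        @classmethod
        def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
            # Each component declares its own defaults; the experiment config
            # is assembled by hanging these nodes off one root CfgNode.
            default = CfgNode(
                dict(
                    batch_size=32,  # batch size
                    num_workers=0,  # data loader workers
                    keep_transcription_text=False, ))
            if config is not None:
                config.merge_from_other_cfg(default)
            return default

        @classmethod
        def from_config(cls, config):
            # Constructors read only their own section (config.collator here),
            # which is the boundary PATCH 10/11 enforce for SpeechCollator.
            assert 'batch_size' in config.collator
            return cls(
                batch_size=config.collator.batch_size,
                keep_transcription_text=config.collator.keep_transcription_text)

        def __init__(self, batch_size=32, keep_transcription_text=False):
            self.batch_size = batch_size
            self.keep_transcription_text = keep_transcription_text


    _C = CfgNode()
    _C.collator = ToyCollator.params()

    if __name__ == '__main__':
        config = _C.clone()
        train_collator = ToyCollator.from_config(config)

        # The dev/test loaders in the patches clone the config and override a
        # collator field (e.g. blanking augmentation_config, or setting
        # keep_transcription_text) before building a second instance.
        config.collator.keep_transcription_text = True
        test_collator = ToyCollator.from_config(config)
        print(train_collator.batch_size, test_collator.keep_transcription_text)

This mirrors why setup_dataloader builds `collate_fn_train` before mutating
`config.collator.augmentation_config = ""` and building `collate_fn_dev`: the
two collators share one config object, so order of construction matters.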