move redundant params

4 years ago · 557427736e
parent 698d7a9bdb
commit 557427736e
9 changed files with 96 additions and 182 deletions
--- a/deepspeech/exps/deepspeech2/config.py
+++ b/deepspeech/exps/deepspeech2/config.py
@ -21,32 +21,18 @@ _C.data = CN(
        train_manifest="",
        dev_manifest="",
        test_manifest="",
        unit_type="char",
        vocab_filepath="",
        spm_model_prefix="",
        mean_std_filepath="",
        augmentation_config="",
        max_duration=float('inf'),
        min_duration=0.0,
    ))
 _C.model = CN(
    dict(
        num_conv_layers=2,  #Number of stacking convolution layers.
        num_rnn_layers=3,  #Number of stacking RNN layers.
        rnn_layer_size=1024,  #RNN layer size (number of RNN cells).
        use_gru=True,  #Use gru if set True. Use simple rnn if set False.
        share_rnn_weights=True  #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported.
    ))
 _C.collator =CN(
    dict(
        augmentation_config="",
        random_seed=0,
        mean_std_filepath="",
        unit_type="char",
        vocab_filepath="",
        spm_model_prefix="",
        mean_std_filepath="",
        augmentation_config="",
        random_seed=0,
        specgram_type='linear',  # 'linear', 'mfcc', 'fbank'
        feat_dim=0,  # 'mfcc', 'fbank'
        delta_delta=False,  # 'mfcc', 'fbank'
@ -65,6 +51,16 @@ _C.collator =CN(
        shuffle_method="batch_shuffle",  # 'batch_shuffle', 'instance_shuffle'
    ))
 _C.model = CN(
    dict(
        num_conv_layers=2,  #Number of stacking convolution layers.
        num_rnn_layers=3,  #Number of stacking RNN layers.
        rnn_layer_size=1024,  #RNN layer size (number of RNN cells).
        use_gru=True,  #Use gru if set True. Use simple rnn if set False.
        share_rnn_weights=True  #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported.
    ))
 DeepSpeech2Model.params(_C.model)
 _C.training = CN(
--- a/deepspeech/exps/deepspeech2/model.py
+++ b/deepspeech/exps/deepspeech2/model.py
@ -143,7 +143,6 @@ class DeepSpeech2Trainer(Trainer):
        train_dataset = ManifestDataset.from_config(config)
        config.data.manifest = config.data.dev_manifest
        config.data.augmentation_config = ""
        dev_dataset = ManifestDataset.from_config(config)
        if self.parallel:
@ -165,18 +164,22 @@ class DeepSpeech2Trainer(Trainer):
                sortagrad=config.collator.sortagrad,
                shuffle_method=config.collator.shuffle_method)
-        collate_fn = SpeechCollator.from_config(config)
+        collate_fn_train = SpeechCollator.from_config(config)
        config.collator.augmentation_config = ""
        collate_fn_dev = SpeechCollator.from_config(config)
        self.train_loader = DataLoader(
            train_dataset,
            batch_sampler=batch_sampler,
-            collate_fn=collate_fn,
+            collate_fn=collate_fn_train,
            num_workers=config.collator.num_workers)
        self.valid_loader = DataLoader(
            dev_dataset,
            batch_size=config.collator.batch_size,
            shuffle=False,
            drop_last=False,
-            collate_fn=collate_fn)
+            collate_fn=collate_fn_dev)
        logger.info("Setup train/valid Dataloader!")
@ -324,8 +327,6 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
        # return raw text
        config.data.manifest = config.data.test_manifest
        config.data.keep_transcription_text = True
        config.data.augmentation_config = ""
        # filter test examples, will cause less examples, but no mismatch with training
        # and can use large batch size , save training time, so filter test egs now.
        # config.data.min_input_len = 0.0  # second
@ -337,6 +338,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
        test_dataset = ManifestDataset.from_config(config)
        config.collator.keep_transcription_text = True
        config.collator.augmentation_config = ""
        # return text ord id
        self.test_loader = DataLoader(
            test_dataset,
--- a/deepspeech/exps/u2/config.py
+++ b/deepspeech/exps/u2/config.py
@ -17,21 +17,13 @@ from deepspeech.exps.u2.model import U2Tester
 from deepspeech.exps.u2.model import U2Trainer
 from deepspeech.io.dataset import ManifestDataset
 from deepspeech.models.u2 import U2Model
 from deepspeech.io.collator import SpeechCollator
 _C = CfgNode()
 _C.data = ManifestDataset.params()
-_C.collator =CfgNode(
+_C.collator = SpeechCollator.params()
    dict(
        augmentation_config="",
        unit_type="char",
        keep_transcription_text=False,
        batch_size=32,  # batch size
        num_workers=0,  # data loader workers
        sortagrad=False,  # sorted in first epoch when True
        shuffle_method="batch_shuffle"  # 'batch_shuffle', 'instance_shuffle'
    ))
 _C.model = U2Model.params()
--- a/deepspeech/exps/u2/model.py
+++ b/deepspeech/exps/u2/model.py
@ -100,7 +100,7 @@ class U2Trainer(Trainer):
        if (batch_index + 1) % train_conf.log_interval == 0:
            msg += "train time: {:>.3f}s, ".format(iteration_time)
-            msg += "batch size: {}, ".format(self.config.data.batch_size)
+            msg += "batch size: {}, ".format(self.config.collator.batch_size)
            msg += "accum: {}, ".format(train_conf.accum_grad)
            msg += ', '.join('{}: {:>.6f}'.format(k, v)
                             for k, v in losses_np.items())
@ -211,51 +211,52 @@ class U2Trainer(Trainer):
    def setup_dataloader(self):
        config = self.config.clone()
        config.defrost()
-        config.data.keep_transcription_text = False
+        config.collator.keep_transcription_text = False
        # train/valid dataset, return token ids
        config.data.manifest = config.data.train_manifest
        train_dataset = ManifestDataset.from_config(config)
        config.data.manifest = config.data.dev_manifest
        config.data.augmentation_config = ""
        dev_dataset = ManifestDataset.from_config(config)
-        collate_fn = SpeechCollator.from_config(config)
+        collate_fn_train = SpeechCollator.from_config(config)
        config.collator.augmentation_config = ""
        collate_fn_dev = SpeechCollator.from_config(config)
        if self.parallel:
            batch_sampler = SortagradDistributedBatchSampler(
                train_dataset,
-                batch_size=config.data.batch_size,
+                batch_size=config.collator.batch_size,
                num_replicas=None,
                rank=None,
                shuffle=True,
                drop_last=True,
-                sortagrad=config.data.sortagrad,
+                sortagrad=config.collator.sortagrad,
-                shuffle_method=config.data.shuffle_method)
+                shuffle_method=config.collator.shuffle_method)
        else:
            batch_sampler = SortagradBatchSampler(
                train_dataset,
                shuffle=True,
-                batch_size=config.data.batch_size,
+                batch_size=config.collator.batch_size,
                drop_last=True,
-                sortagrad=config.data.sortagrad,
+                sortagrad=config.collator.sortagrad,
-                shuffle_method=config.data.shuffle_method)
+                shuffle_method=config.collator.shuffle_method)
        self.train_loader = DataLoader(
            train_dataset,
            batch_sampler=batch_sampler,
-            collate_fn=collate_fn,
+            collate_fn=collate_fn_train,
-            num_workers=config.data.num_workers, )
+            num_workers=config.collator.num_workers, )
        self.valid_loader = DataLoader(
            dev_dataset,
-            batch_size=config.data.batch_size,
+            batch_size=config.collator.batch_size,
            shuffle=False,
            drop_last=False,
-            collate_fn=collate_fn)
+            collate_fn=collate_fn_dev)
        # test dataset, return raw text
        config.data.manifest = config.data.test_manifest
        config.data.keep_transcription_text = True
        config.data.augmentation_config = ""
        # filter test examples, will cause less examples, but no mismatch with training
        # and can use large batch size , save training time, so filter test egs now.
        # config.data.min_input_len = 0.0  # second
@ -264,9 +265,11 @@ class U2Trainer(Trainer):
        # config.data.max_output_len = float('inf')  # tokens
        # config.data.min_output_input_ratio = 0.00
        # config.data.max_output_input_ratio = float('inf')
        test_dataset = ManifestDataset.from_config(config)
        # return text ord id
        config.collator.keep_transcription_text = True
        config.collator.augmentation_config = ""
        self.test_loader = DataLoader(
            test_dataset,
            batch_size=config.decoding.batch_size,
--- a/deepspeech/io/collator.py
+++ b/deepspeech/io/collator.py
@ -75,8 +75,8 @@ class SpeechCollator():
        """
        assert 'augmentation_config' in config.collator
        assert 'keep_transcription_text' in config.collator
-        assert 'mean_std_filepath' in config.data
+        assert 'mean_std_filepath' in config.collator
-        assert 'vocab_filepath' in config.data
+        assert 'vocab_filepath' in config.collator
        assert 'specgram_type' in config.collator
        assert 'n_fft' in config.collator
        assert config.collator
@ -94,9 +94,9 @@ class SpeechCollator():
        speech_collator = cls(
                aug_file=aug_file,
                random_seed=0,
-                mean_std_filepath=config.data.mean_std_filepath,
+                mean_std_filepath=config.collator.mean_std_filepath,
                unit_type=config.collator.unit_type,
-                vocab_filepath=config.data.vocab_filepath,
+                vocab_filepath=config.collator.vocab_filepath,
                spm_model_prefix=config.collator.spm_model_prefix,
                specgram_type=config.collator.specgram_type, 
                feat_dim=config.collator.feat_dim, 
@ -129,11 +129,31 @@ class SpeechCollator():
                target_dB=-20,
                dither=1.0,
                keep_transcription_text=True):
-        """
+        """SpeechCollator Collator
        Padding audio features with zeros to make them have the same shape (or
        a user-defined shape) within one bach.
-        if ``keep_transcription_text`` is False, text is token ids else is raw string.
+        Args:
            unit_type(str): token unit type, e.g. char, word, spm
            vocab_filepath (str): vocab file path.
            mean_std_filepath (str): mean and std file path, which suffix is *.npy
            spm_model_prefix (str): spm model prefix, need if `unit_type` is spm.
            augmentation_config (str, optional): augmentation json str. Defaults to '{}'.
            stride_ms (float, optional): stride size in ms. Defaults to 10.0.
            window_ms (float, optional): window size in ms. Defaults to 20.0.
            n_fft (int, optional): fft points for rfft. Defaults to None.
            max_freq (int, optional): max cut freq. Defaults to None.
            target_sample_rate (int, optional): target sample rate which used for training. Defaults to 16000.
            specgram_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'.
            feat_dim (int, optional): audio feature dim, using by 'mfcc' or 'fbank'. Defaults to None.
            delta_delta (bool, optional): audio feature with delta-delta, using by 'fbank' or 'mfcc'. Defaults to False.
            use_dB_normalization (bool, optional): do dB normalization. Defaults to True.
            target_dB (int, optional): target dB. Defaults to -20.
            random_seed (int, optional): for random generator. Defaults to 0.
            keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False.
            if ``keep_transcription_text`` is False, text is token ids else is raw string.
        Do augmentations 
        Padding audio features with zeros to make them have the same shape (or
        a user-defined shape) within one batch.
        """
        self._keep_transcription_text = keep_transcription_text
--- a/deepspeech/io/dataset.py
+++ b/deepspeech/io/dataset.py
@ -40,15 +40,7 @@ class ManifestDataset(Dataset):
    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
        default = CfgNode(
            dict(
                train_manifest="",
                dev_manifest="",
                test_manifest="",
                manifest="",
                unit_type="char",
                vocab_filepath="",
                spm_model_prefix="",
                mean_std_filepath="",
                augmentation_config="",
                max_input_len=27.0,
                min_input_len=0.0,
                max_output_len=float('inf'),
@ -73,25 +65,10 @@ class ManifestDataset(Dataset):
        """
        assert 'manifest' in config.data
        assert config.data.manifest
-        assert 'keep_transcription_text' in config.collator
+
        if isinstance(config.data.augmentation_config, (str, bytes)):
            if config.data.augmentation_config:
                aug_file = io.open(
                    config.data.augmentation_config, mode='r', encoding='utf8')
            else:
                aug_file = io.StringIO(initial_value='{}', newline='')
        else:
            aug_file = config.data.augmentation_config
            assert isinstance(aug_file, io.StringIO)
        dataset = cls(
            manifest_path=config.data.manifest,
            unit_type=config.data.unit_type,
            vocab_filepath=config.data.vocab_filepath,
            mean_std_filepath=config.data.mean_std_filepath,
            spm_model_prefix=config.data.spm_model_prefix,
            augmentation_config=aug_file.read(),
            max_input_len=config.data.max_input_len,
            min_input_len=config.data.min_input_len,
            max_output_len=config.data.max_output_len,
@ -101,23 +78,8 @@ class ManifestDataset(Dataset):
            )
        return dataset
    def _read_vocab(self, vocab_filepath):
        """Load vocabulary from file."""
        vocab_lines = []
        with open(vocab_filepath, 'r', encoding='utf-8') as file:
            vocab_lines.extend(file.readlines())
        vocab_list = [line[:-1] for line in vocab_lines]
        return vocab_list
    def __init__(self,
                 manifest_path,
                 unit_type,
                 vocab_filepath,
                 mean_std_filepath,
                 spm_model_prefix=None,
                 augmentation_config='{}',
                 max_input_len=float('inf'),
                 min_input_len=0.0,
                 max_output_len=float('inf'),
@ -128,34 +90,16 @@ class ManifestDataset(Dataset):
        Args:
            manifest_path (str): manifest josn file path
            unit_type(str): token unit type, e.g. char, word, spm
            vocab_filepath (str): vocab file path.
            mean_std_filepath (str): mean and std file path, which suffix is *.npy
            spm_model_prefix (str): spm model prefix, need if `unit_type` is spm.
            augmentation_config (str, optional): augmentation json str. Defaults to '{}'.
            max_input_len ([type], optional): maximum output seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to float('inf').
            min_input_len (float, optional): minimum input seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to 0.0.
            max_output_len (float, optional): maximum input seq length, in modeling units. Defaults to 500.0.
            min_output_len (float, optional): minimum input seq length, in modeling units. Defaults to 0.0.
            max_output_input_ratio (float, optional): maximum output seq length/output seq length ratio. Defaults to 10.0.
            min_output_input_ratio (float, optional): minimum output seq length/output seq length ratio. Defaults to 0.05.
-            stride_ms (float, optional): stride size in ms. Defaults to 10.0.
+        
            window_ms (float, optional): window size in ms. Defaults to 20.0.
            n_fft (int, optional): fft points for rfft. Defaults to None.
            max_freq (int, optional): max cut freq. Defaults to None.
            target_sample_rate (int, optional): target sample rate which used for training. Defaults to 16000.
            specgram_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'.
            feat_dim (int, optional): audio feature dim, using by 'mfcc' or 'fbank'. Defaults to None.
            delta_delta (bool, optional): audio feature with delta-delta, using by 'fbank' or 'mfcc'. Defaults to False.
            use_dB_normalization (bool, optional): do dB normalization. Defaults to True.
            target_dB (int, optional): target dB. Defaults to -20.
            random_seed (int, optional): for random generator. Defaults to 0.
            keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False.
        """
        super().__init__()
        # self._rng = np.random.RandomState(random_seed)
        # read manifest
        self._manifest = read_manifest(
            manifest_path=manifest_path,
@ -167,51 +111,6 @@ class ManifestDataset(Dataset):
            min_output_input_ratio=min_output_input_ratio)
        self._manifest.sort(key=lambda x: x["feat_shape"][0])
        # self._vocab_list = self._read_vocab(vocab_filepath)
    # @property
    # def manifest(self):
    #     return self._manifest
    # @property
    # def vocab_size(self):
    #     """Return the vocabulary size.
    #     Returns:
    #         int: Vocabulary size.
    #     """
    #     return len(self._vocab_list)
    # @property
    # def vocab_list(self):
    #     """Return the vocabulary in list.
    #     Returns:
    #         List[str]: 
    #     """
    #     return self._vocab_list
    # @property
    # def vocab_dict(self):
    #     """Return the vocabulary in dict.
    #     Returns:
    #         Dict[str, int]: 
    #     """
    #     vocab_dict = dict(
    #         [(token, idx) for (idx, token) in enumerate(self._vocab_list)])
    #     return vocab_dict
    # @property
    # def feature_size(self):
    #     """Return the audio feature size.
    #     Returns:
    #         int: audio feature size.
    #     """
    #     return self._manifest[0]["feat_shape"][-1]
    def __len__(self):
        return len(self._manifest)
--- a/examples/aishell/s1/conf/conformer.yaml
+++ b/examples/aishell/s1/conf/conformer.yaml
@ -3,17 +3,20 @@ data:
  train_manifest: data/manifest.train
  dev_manifest: data/manifest.dev
  test_manifest: data/manifest.test
  vocab_filepath: data/vocab.txt 
  unit_type: 'char'
  spm_model_prefix: ''
  augmentation_config: conf/augmentation.json
  batch_size: 64
  min_input_len: 0.5
  max_input_len: 20.0 # second
  min_output_len: 0.0
  max_output_len: 400.0
  min_output_input_ratio: 0.05
  max_output_input_ratio: 10.0
 collator:
  vocab_filepath: data/vocab.txt 
  unit_type: 'char'
  spm_model_prefix: ''
  augmentation_config: conf/augmentation.json
  batch_size: 64
  raw_wav: True  # use raw_wav or kaldi feature
  specgram_type: fbank #linear, mfcc, fbank
  feat_dim: 80
@ -32,7 +35,6 @@ data:
  shuffle_method: batch_shuffle
  num_workers: 2
 # network architecture
 model:
    cmvn_file: "data/mean_std.json"
--- a/examples/tiny/s0/conf/deepspeech2.yaml
+++ b/examples/tiny/s0/conf/deepspeech2.yaml
@ -2,10 +2,7 @@
 data:
  train_manifest: data/manifest.tiny
  dev_manifest: data/manifest.tiny
-  test_manifest: data/manifest.tiny
+  test_manifest: data/manifest.tiny 
  mean_std_filepath: data/mean_std.json
  unit_type: char
  vocab_filepath: data/vocab.txt 
  min_input_len: 0.0
  max_input_len: 27.0
  min_output_len: 0.0
@ -15,6 +12,9 @@ data:
 collator:
  mean_std_filepath: data/mean_std.json
  unit_type: char
  vocab_filepath: data/vocab.txt
  augmentation_config: conf/augmentation.json
  random_seed: 0
  spm_model_prefix: 
@ -43,7 +43,7 @@ model:
  share_rnn_weights: True 
 training:
-  n_epoch: 23
+  n_epoch: 24
  lr: 1e-5 
  lr_decay: 1.0 
  weight_decay: 1e-06
--- a/examples/tiny/s1/conf/transformer.yaml
+++ b/examples/tiny/s1/conf/transformer.yaml
@ -3,26 +3,20 @@ data:
  train_manifest: data/manifest.tiny
  dev_manifest: data/manifest.tiny
  test_manifest: data/manifest.tiny
  vocab_filepath: data/vocab.txt 
  unit_type: 'spm'
  spm_model_prefix: 'data/bpe_unigram_200'
  mean_std_filepath: ""
  batch_size: 4
  min_input_len: 0.5  # second
  max_input_len: 20.0 # second
  min_output_len: 0.0 # tokens
  max_output_len: 400.0 # tokens
  min_output_input_ratio: 0.05
  max_output_input_ratio: 10.0
-  raw_wav: True  # use raw_wav or kaldi feature
+  
  sortagrad: True 
  shuffle_method: batch_shuffle
  num_workers: 0 #2
 collator:
  vocab_filepath: data/vocab.txt 
  mean_std_filepath: ""
  augmentation_config: conf/augmentation.json
  random_seed: 0
-  spm_model_prefix: 
+  unit_type: 'spm'
  spm_model_prefix: 'data/bpe_unigram_200'
  specgram_type: fbank
  feat_dim: 80
  delta_delta: False
@ -35,6 +29,12 @@ collator:
  target_dB: -20
  dither: 1.0
  keep_transcription_text: False
  batch_size: 4
  sortagrad: True 
  shuffle_method: batch_shuffle
  num_workers: 0 #2
  raw_wav: True  # use raw_wav or kaldi feature
 # network architecture
 model: