refactor train

add aishell egs
pull/522/head
Hui Zhang 5 years ago
parent 4926544859
commit 6f5b837e54

@@ -237,9 +237,9 @@ class DeepSpeech2DistributedBatchSampler(DistributedBatchSampler):
         assert (clipped == False)
         if not clipped:
             res_len = len(indices) - shift_len - len(batch_indices)
-            assert res_len != 0, f"_batch_shuffle clipped {len(indices)} , {shift_len}, {len(batch_indices)}"
-            batch_indices.extend(indices[-res_len:])
+            # when res_len is 0, will return whole list, len(List[-0:]) = len(List[:])
+            if res_len != 0:
+                batch_indices.extend(indices[-res_len:])
             batch_indices.extend(indices[0:shift_len])
         assert len(indices) == len(
             batch_indices
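
Note (same fix lands in DeepSpeech2BatchSampler below): the old code asserted
res_len != 0 and then unconditionally extended with indices[-res_len:]. The
guard matters because Python slicing makes indices[-0:] the whole list rather
than an empty one, so a zero-length tail would silently duplicate every index.
A minimal standalone sketch of the gotcha and the fix (illustrative only, not
the repo's helper):

    indices = list(range(10))
    res_len = 0
    assert indices[-res_len:] == indices[:]  # -0 slices from the start: whole list!
    # hence the guarded extend: only re-append the tail when one exists
    tail = indices[-res_len:] if res_len != 0 else []
    assert tail == []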
@@ -381,9 +381,9 @@ class DeepSpeech2BatchSampler(BatchSampler):
         assert (clipped == False)
         if not clipped:
             res_len = len(indices) - shift_len - len(batch_indices)
-            assert res_len != 0, f"_batch_shuffle clipped {len(indices)} , {shift_len}, {len(batch_indices)}"
-            batch_indices.extend(indices[-res_len:])
+            # when res_len is 0, will return whole list, len(List[-0:]) = len(List[:])
+            if res_len != 0:
+                batch_indices.extend(indices[-res_len:])
             batch_indices.extend(indices[0:shift_len])
         assert len(indices) == len(
             batch_indices
@@ -533,107 +533,3 @@ class SpeechCollator():
         texts = np.array(texts).astype('int32')
         text_lens = np.array(text_lens).astype('int64')
         return padded_audios, texts, audio_lens, text_lens
-# def create_dataloader(manifest_path,
-#                       vocab_filepath,
-#                       mean_std_filepath,
-#                       augmentation_config='{}',
-#                       max_duration=float('inf'),
-#                       min_duration=0.0,
-#                       stride_ms=10.0,
-#                       window_ms=20.0,
-#                       max_freq=None,
-#                       specgram_type='linear',
-#                       use_dB_normalization=True,
-#                       random_seed=0,
-#                       keep_transcription_text=False,
-#                       is_training=False,
-#                       batch_size=1,
-#                       num_workers=0,
-#                       sortagrad=False,
-#                       shuffle_method=None,
-#                       dist=False):
-#     dataset = DeepSpeech2Dataset(
-#         manifest_path,
-#         vocab_filepath,
-#         mean_std_filepath,
-#         augmentation_config=augmentation_config,
-#         max_duration=max_duration,
-#         min_duration=min_duration,
-#         stride_ms=stride_ms,
-#         window_ms=window_ms,
-#         max_freq=max_freq,
-#         specgram_type=specgram_type,
-#         use_dB_normalization=use_dB_normalization,
-#         random_seed=random_seed,
-#         keep_transcription_text=keep_transcription_text)
-#     if dist:
-#         batch_sampler = DeepSpeech2DistributedBatchSampler(
-#             dataset,
-#             batch_size,
-#             num_replicas=None,
-#             rank=None,
-#             shuffle=is_training,
-#             drop_last=is_training,
-#             sortagrad=is_training,
-#             shuffle_method=shuffle_method)
-#     else:
-#         batch_sampler = DeepSpeech2BatchSampler(
-#             dataset,
-#             shuffle=is_training,
-#             batch_size=batch_size,
-#             drop_last=is_training,
-#             sortagrad=is_training,
-#             shuffle_method=shuffle_method)
-#     def padding_batch(batch, padding_to=-1, flatten=False, is_training=True):
-#         """
-#         Padding audio features with zeros to make them have the same shape (or
-#         a user-defined shape) within one batch.
-#         If ``padding_to`` is -1, the maximum shape in the batch will be used
-#         as the target shape for padding. Otherwise, `padding_to` will be the
-#         target shape (only refers to the second axis).
-#         If `flatten` is True, features will be flattened to a 1-D array.
-#         """
-#         new_batch = []
-#         # get target shape
-#         max_length = max([audio.shape[1] for audio, text in batch])
-#         if padding_to != -1:
-#             if padding_to < max_length:
-#                 raise ValueError("If padding_to is not -1, it should be larger "
-#                                  "than any instance's shape in the batch")
-#             max_length = padding_to
-#         max_text_length = max([len(text) for audio, text in batch])
-#         # padding
-#         padded_audios = []
-#         audio_lens = []
-#         texts, text_lens = [], []
-#         for audio, text in batch:
-#             padded_audio = np.zeros([audio.shape[0], max_length])
-#             padded_audio[:, :audio.shape[1]] = audio
-#             if flatten:
-#                 padded_audio = padded_audio.flatten()
-#             padded_audios.append(padded_audio)
-#             audio_lens.append(audio.shape[1])
-#             padded_text = np.zeros([max_text_length])
-#             padded_text[:len(text)] = text
-#             texts.append(padded_text)
-#             text_lens.append(len(text))
-#         padded_audios = np.array(padded_audios).astype('float32')
-#         audio_lens = np.array(audio_lens).astype('int64')
-#         texts = np.array(texts).astype('int32')
-#         text_lens = np.array(text_lens).astype('int64')
-#         return padded_audios, texts, audio_lens, text_lens
-#     loader = DataLoader(
-#         dataset,
-#         batch_sampler=batch_sampler,
-#         collate_fn=partial(padding_batch, is_training=is_training),
-#         num_workers=num_workers, )
-#     return loader
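
Note: the commented-out create_dataloader path was dead code; SpeechCollator
(whose tail is the context above) now owns the padding logic. For reference, a
self-contained sketch of that zero-padding scheme (hypothetical helper name,
mirroring the deleted padding_batch):

    import numpy as np

    def pad_batch(batch):
        """Zero-pad (feat_dim, time) features and token ids to the batch max."""
        max_len = max(audio.shape[1] for audio, text in batch)
        max_text = max(len(text) for _, text in batch)
        audios, audio_lens, texts, text_lens = [], [], [], []
        for audio, text in batch:
            padded = np.zeros([audio.shape[0], max_len], dtype='float32')
            padded[:, :audio.shape[1]] = audio          # copy; the rest stays zero
            audios.append(padded)
            audio_lens.append(audio.shape[1])
            padded_text = np.zeros([max_text], dtype='int32')
            padded_text[:len(text)] = text
            texts.append(padded_text)
            text_lens.append(len(text))
        return (np.stack(audios),                       # (B, feat_dim, max_len) float32
                np.stack(texts),                        # (B, max_text) int32
                np.array(audio_lens, dtype='int64'),
                np.array(text_lens, dtype='int64'))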

@@ -39,13 +39,13 @@ training:
   valid_interval: 1000
 decoding:
   batch_size: 128
-  error_rate_type: wer
+  error_rate_type: cer
   decoding_method: ctc_beam_search
-  lang_model_path: models/lm/common_crawl_00.prune01111.trie.klm
-  alpha: 2.5
-  beta: 0.3
-  beam_size: 500
-  cutoff_prob: 1.0
+  lang_model_path: models/lm/zh_giga.no_cna_cmn.prune01244.klm
+  alpha: 2.6
+  beta: 5.0
+  beam_size: 300
+  cutoff_prob: 0.99
   cutoff_top_n: 40
   num_proc_bsearch: 8
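
Note: the aishell eg scores Mandarin, hence CER instead of WER and the
zh_giga KLM instead of the English common_crawl model. In DeepSpeech-style
CTC beam search, alpha and beta are the usual shallow-fusion weights,
roughly score(y) = log p_ctc(y|x) + alpha * log p_lm(y) + beta * |y|, so
alpha: 2.6 / beta: 5.0 leans harder on the LM and rewards longer hypotheses.
A toy scorer under that assumption (not the decoder's actual code):

    def fused_score(log_p_ctc, log_p_lm, length, alpha=2.6, beta=5.0):
        # acoustic score + weighted LM score + length bonus
        return log_p_ctc + alpha * log_p_lm + beta * length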

@@ -9,30 +9,12 @@ fi
 cd - > /dev/null

 # evaluate model
-CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+CUDA_VISIBLE_DEVICES=6 \
 python3 -u ${MAIN_ROOT}/test.py \
---batch_size=128 \
---beam_size=300 \
---num_proc_bsearch=8 \
---num_conv_layers=2 \
---num_rnn_layers=3 \
---rnn_layer_size=1024 \
---alpha=2.6 \
---beta=5.0 \
---cutoff_prob=0.99 \
---cutoff_top_n=40 \
---use_gru=True \
---use_gpu=True \
---share_rnn_weights=False \
---test_manifest="data/manifest.test" \
---mean_std_path="data/mean_std.npz" \
---vocab_path="data/vocab.txt" \
---model_path="checkpoints/step_final" \
---lang_model_path="${MAIN_ROOT}/models/lm/zh_giga.no_cna_cmn.prune01244.klm" \
---decoding_method="ctc_beam_search" \
---error_rate_type="cer" \
---specgram_type="linear"
+--device 'gpu' \
+--nproc 1 \
+--config conf/deepspeech2.yaml \
+--output ckpt

 if [ $? -ne 0 ]; then
     echo "Failed in evaluation!"

@@ -32,7 +32,7 @@ export FLAGS_sync_nccl_allreduce=0
 #--specgram_type="linear" \
 #--shuffle_method="batch_shuffle_clipped" \
-CUDA_VISIBLE_DEVICES=1,2,6,7 \
+CUDA_VISIBLE_DEVICES=2,3,5,7 \
 python3 -u ${MAIN_ROOT}/train.py \
 --device 'gpu' \
 --nproc 4 \

@@ -26,7 +26,7 @@ model:
   num_conv_layers: 2
   num_rnn_layers: 3
   rnn_layer_size: 2048
-  use_gru: False
+  use_gru: True
   share_rnn_weights: True
 training:
   n_epoch: 20

@@ -8,7 +8,7 @@ if [ $? -ne 0 ]; then
 fi
 cd - > /dev/null
-CUDA_VISIBLE_DEVICES=0,1,2,3 \
+CUDA_VISIBLE_DEVICES=0 \
 python3 -u ${MAIN_ROOT}/test.py \
 --device 'gpu' \
 --nproc 1 \

@@ -3,8 +3,7 @@
 export FLAGS_sync_nccl_allreduce=0
 #CUDA_VISIBLE_DEVICES=0,1,2,3 \
-#CUDA_VISIBLE_DEVICES=0,4,5,6 \
-CUDA_VISIBLE_DEVICES=0 \
+CUDA_VISIBLE_DEVICES=0,1 \
 python3 -u ${MAIN_ROOT}/train.py \
 --device 'gpu' \
 --nproc 1 \

@@ -68,32 +68,18 @@ class DeepSpeech2Trainer(Trainer):
         loss = self.criterion(logits, texts, logits_len, texts_len)
         return loss

-    def read_batch(self):
-        """Read a batch from the train_loader.
-        Returns
-        -------
-        List[Tensor]
-            A batch.
-        """
-        try:
-            batch = next(self.iterator)
-        except StopIteration as e:
-            raise e
-        return batch
-
-    def train_batch(self):
+    def train_batch(self, batch_data):
         start = time.time()
-        batch = self.read_batch()
-        data_loader_time = time.time() - start
-        self.optimizer.clear_grad()
         self.model.train()
-        audio, text, audio_len, text_len = batch
-        batch_size = audio.shape[0]
+        audio, text, audio_len, text_len = batch_data
         outputs = self.model(audio, text, audio_len, text_len)
-        loss = self.compute_losses(batch, outputs)
+        loss = self.compute_losses(batch_data, outputs)
         loss.backward()
         self.optimizer.step()
+        self.optimizer.clear_grad()
         iteration_time = time.time() - start

         losses_np = {
@@ -104,13 +90,9 @@ class DeepSpeech2Trainer(Trainer):
         msg = "Train: Rank: {}, ".format(dist.get_rank())
         msg += "epoch: {}, ".format(self.epoch)
         msg += "step: {}, ".format(self.iteration)
-        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
-                                                  iteration_time)
-        msg += f"batch size: {batch_size}, "
+        msg += "time: {:>.3f}s, ".format(iteration_time)
         msg += ', '.join('{}: {:>.6f}'.format(k, v)
                          for k, v in losses_np.items())
-        #if self.iteration % 100 == 0:
         self.logger.info(msg)
         if dist.get_rank() == 0 and self.visualizer:
@@ -118,6 +100,14 @@ class DeepSpeech2Trainer(Trainer):
                 self.visualizer.add_scalar("train/{}".format(k), v,
                                            self.iteration)

+    def new_epoch(self):
+        """Reset the train loader and increment ``epoch``.
+        """
+        if self.parallel:
+            # batch sampler epoch start from 0
+            self.train_loader.batch_sampler.set_epoch(self.epoch)
+        self.epoch += 1
+
     def train(self):
         """The training process.
@@ -126,21 +116,20 @@ class DeepSpeech2Trainer(Trainer):
         """
         self.new_epoch()
         while self.epoch <= self.config.training.n_epoch:
-            try:
+            for batch in self.train_loader:
                 self.iteration += 1
-                self.train_batch()
+                self.train_batch(batch)
                 # if self.iteration % self.config.training.valid_interval == 0:
                 #     self.valid()
                 # if self.iteration % self.config.training.save_interval == 0:
                 #     self.save()
-            except StopIteration:
-                self.iteration -= 1  #epoch end, iteration ahead 1
-                self.valid()
-                self.save()
-                self.lr_scheduler.step()
-                self.new_epoch()
+            self.valid()
+            self.save()
+            self.lr_scheduler.step()
+            self.new_epoch()

     def compute_metrics(self, inputs, outputs):
         pass
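
Note: with the loader driving the epoch as a plain for loop, the StopIteration
dance (and the iteration counter walk-back) disappears, and new_epoch() seeding
the distributed sampler with the epoch number is what keeps shuffling
deterministic across ranks while still varying per epoch. A sketch of the idea
behind set_epoch (illustrative, not Paddle's implementation):

    import numpy as np

    def epoch_indices(num_samples, epoch):
        rng = np.random.RandomState(epoch)  # epoch-seeded: identical on every rank
        idx = np.arange(num_samples)
        rng.shuffle(idx)
        return idx

    assert (epoch_indices(8, 0) == epoch_indices(8, 0)).all()  # ranks agree
    assert (epoch_indices(8, 0) != epoch_indices(8, 1)).any()  # epochs differ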
@@ -152,14 +141,13 @@ class DeepSpeech2Trainer(Trainer):
         valid_losses = defaultdict(list)
         for i, batch in enumerate(self.valid_loader):
             audio, text, audio_len, text_len = batch
-            batch_size = audio.shape[0]
             outputs = self.model(audio, text, audio_len, text_len)
             loss = self.compute_losses(batch, outputs)
             metrics = self.compute_metrics(batch, outputs)
             valid_losses['val_loss'].append(float(loss))
             valid_losses['val_loss_div_batchsize'].append(
-                float(loss) / batch_size)
+                float(loss) / self.config.data.batch_size)

         # write visual log
         valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}
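
Note: normalizing by self.config.data.batch_size assumes every validation
batch is full; with a short final batch (drop_last=False) the configured
divisor understates that batch's per-sample loss, whereas the dropped
audio.shape[0] was exact. Illustrative numbers only:

    loss_sum = 51.2
    config_bs, actual_bs = 64, 48      # last batch of the set is short
    print(loss_sum / config_bs)        # 0.8   (configured divisor)
    print(loss_sum / actual_bs)        # ~1.07 (actual batch size)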
