refactor train

add aishell egs
pull/522/head
Hui Zhang 5 years ago
parent 4926544859
commit 6f5b837e54

@@ -237,9 +237,9 @@ class DeepSpeech2DistributedBatchSampler(DistributedBatchSampler):
         assert (clipped == False)
         if not clipped:
             res_len = len(indices) - shift_len - len(batch_indices)
-            assert res_len != 0, f"_batch_shuffle clipped {len(indices)} , {shift_len}, {len(batch_indices)}"
             # when res_len is 0, will return whole list, len(List[-0:]) = len(List[:])
-            batch_indices.extend(indices[-res_len:])
+            if res_len != 0:
+                batch_indices.extend(indices[-res_len:])
             batch_indices.extend(indices[0:shift_len])
             assert len(indices) == len(
                 batch_indices
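Note on this change (repeated verbatim in DeepSpeech2BatchSampler below): a negative-zero slice start does not select an empty tail in Python, so extending with indices[-0:] would duplicate the whole list; the old assert guarded a case that really needs an if. A minimal standalone sketch of the pitfall:

# indices[-res_len:] with res_len == 0 returns the WHOLE list,
# because list[-0:] is the same as list[0:]; extending with it
# would duplicate every index in the shuffled batch.
indices = [0, 1, 2, 3, 4]

res_len = 0
assert indices[-res_len:] == indices        # the whole list, not []

res_len = 2
assert indices[-res_len:] == [3, 4]         # normal case: the tail

# Hence the guard introduced above:
batch_indices = []
if res_len != 0:
    batch_indices.extend(indices[-res_len:])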
@@ -381,9 +381,9 @@ class DeepSpeech2BatchSampler(BatchSampler):
         assert (clipped == False)
         if not clipped:
             res_len = len(indices) - shift_len - len(batch_indices)
-            assert res_len != 0, f"_batch_shuffle clipped {len(indices)} , {shift_len}, {len(batch_indices)}"
             # when res_len is 0, will return whole list, len(List[-0:]) = len(List[:])
-            batch_indices.extend(indices[-res_len:])
+            if res_len != 0:
+                batch_indices.extend(indices[-res_len:])
             batch_indices.extend(indices[0:shift_len])
             assert len(indices) == len(
                 batch_indices
@@ -532,108 +532,4 @@ class SpeechCollator():
         audio_lens = np.array(audio_lens).astype('int64')
         texts = np.array(texts).astype('int32')
         text_lens = np.array(text_lens).astype('int64')
         return padded_audios, texts, audio_lens, text_lens
-
-
-# def create_dataloader(manifest_path,
-#                       vocab_filepath,
-#                       mean_std_filepath,
-#                       augmentation_config='{}',
-#                       max_duration=float('inf'),
-#                       min_duration=0.0,
-#                       stride_ms=10.0,
-#                       window_ms=20.0,
-#                       max_freq=None,
-#                       specgram_type='linear',
-#                       use_dB_normalization=True,
-#                       random_seed=0,
-#                       keep_transcription_text=False,
-#                       is_training=False,
-#                       batch_size=1,
-#                       num_workers=0,
-#                       sortagrad=False,
-#                       shuffle_method=None,
-#                       dist=False):
-#     dataset = DeepSpeech2Dataset(
-#         manifest_path,
-#         vocab_filepath,
-#         mean_std_filepath,
-#         augmentation_config=augmentation_config,
-#         max_duration=max_duration,
-#         min_duration=min_duration,
-#         stride_ms=stride_ms,
-#         window_ms=window_ms,
-#         max_freq=max_freq,
-#         specgram_type=specgram_type,
-#         use_dB_normalization=use_dB_normalization,
-#         random_seed=random_seed,
-#         keep_transcription_text=keep_transcription_text)
-
-#     if dist:
-#         batch_sampler = DeepSpeech2DistributedBatchSampler(
-#             dataset,
-#             batch_size,
-#             num_replicas=None,
-#             rank=None,
-#             shuffle=is_training,
-#             drop_last=is_training,
-#             sortagrad=is_training,
-#             shuffle_method=shuffle_method)
-#     else:
-#         batch_sampler = DeepSpeech2BatchSampler(
-#             dataset,
-#             shuffle=is_training,
-#             batch_size=batch_size,
-#             drop_last=is_training,
-#             sortagrad=is_training,
-#             shuffle_method=shuffle_method)
-
-#     def padding_batch(batch, padding_to=-1, flatten=False, is_training=True):
-#         """
-#         Padding audio features with zeros to make them have the same shape (or
-#         a user-defined shape) within one batch.
-#
-#         If ``padding_to`` is -1, the maximum shape in the batch will be used
-#         as the target shape for padding. Otherwise, `padding_to` will be the
-#         target shape (only refers to the second axis).
-#
-#         If `flatten` is True, features will be flattened to a 1-D array.
-#         """
-#         new_batch = []
-#         # get target shape
-#         max_length = max([audio.shape[1] for audio, text in batch])
-#         if padding_to != -1:
-#             if padding_to < max_length:
-#                 raise ValueError("If padding_to is not -1, it should be larger "
-#                                  "than any instance's shape in the batch")
-#             max_length = padding_to
-#         max_text_length = max([len(text) for audio, text in batch])
-#         # padding
-#         padded_audios = []
-#         audio_lens = []
-#         texts, text_lens = [], []
-#         for audio, text in batch:
-#             padded_audio = np.zeros([audio.shape[0], max_length])
-#             padded_audio[:, :audio.shape[1]] = audio
-#             if flatten:
-#                 padded_audio = padded_audio.flatten()
-#             padded_audios.append(padded_audio)
-#             audio_lens.append(audio.shape[1])
-#             padded_text = np.zeros([max_text_length])
-#             padded_text[:len(text)] = text
-#             texts.append(padded_text)
-#             text_lens.append(len(text))
-#         padded_audios = np.array(padded_audios).astype('float32')
-#         audio_lens = np.array(audio_lens).astype('int64')
-#         texts = np.array(texts).astype('int32')
-#         text_lens = np.array(text_lens).astype('int64')
-#         return padded_audios, texts, audio_lens, text_lens
-
-#     loader = DataLoader(
-#         dataset,
-#         batch_sampler=batch_sampler,
-#         collate_fn=partial(padding_batch, is_training=is_training),
-#         num_workers=num_workers, )
-#     return loader
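The block deleted above was dead, commented-out code; its zero-padding collation survives in SpeechCollator, whose output appears in the context lines at the top of the hunk. For reference, a standalone numpy sketch of that collation (the name pad_batch is illustrative, not the live API):

import numpy as np

def pad_batch(batch):
    """batch: list of (audio [D, T], text [L]) pairs with varying T and L."""
    max_len = max(audio.shape[1] for audio, _ in batch)
    max_text_len = max(len(text) for _, text in batch)

    padded_audios, audio_lens, texts, text_lens = [], [], [], []
    for audio, text in batch:
        padded = np.zeros([audio.shape[0], max_len])
        padded[:, :audio.shape[1]] = audio      # left-aligned, zero-padded tail
        padded_audios.append(padded)
        audio_lens.append(audio.shape[1])

        padded_text = np.zeros([max_text_len])
        padded_text[:len(text)] = text
        texts.append(padded_text)
        text_lens.append(len(text))

    return (np.array(padded_audios).astype('float32'),
            np.array(texts).astype('int32'),
            np.array(audio_lens).astype('int64'),
            np.array(text_lens).astype('int64'))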

@@ -39,13 +39,13 @@ training:
   valid_interval: 1000
 
 decoding:
   batch_size: 128
-  error_rate_type: wer
+  error_rate_type: cer
   decoding_method: ctc_beam_search
-  lang_model_path: models/lm/common_crawl_00.prune01111.trie.klm
-  alpha: 2.5
-  beta: 0.3
-  beam_size: 500
-  cutoff_prob: 1.0
+  lang_model_path: models/lm/zh_giga.no_cna_cmn.prune01244.klm
+  alpha: 2.6
+  beta: 5.0
+  beam_size: 300
+  cutoff_prob: 0.99
   cutoff_top_n: 40
   num_proc_bsearch: 8
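The retuned alpha/beta pair follows the usual DeepSpeech2-style shallow fusion, in which alpha scales the external language model score and beta adds a per-word insertion bonus. A sketch of how the two enter a beam hypothesis score (assumed formulation from the DeepSpeech2 paper, not this project's decoder code):

def beam_score(ctc_log_prob, lm_log_prob, word_count,
               alpha=2.6, beta=5.0):
    # acoustic CTC score, plus the LM score scaled by alpha, plus a
    # length bonus that offsets the LM's preference for short outputs
    return ctc_log_prob + alpha * lm_log_prob + beta * word_count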

@@ -9,30 +9,12 @@ fi
 cd - > /dev/null
 
-# evaluate model
-CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+CUDA_VISIBLE_DEVICES=6 \
 python3 -u ${MAIN_ROOT}/test.py \
---batch_size=128 \
---beam_size=300 \
---num_proc_bsearch=8 \
---num_conv_layers=2 \
---num_rnn_layers=3 \
---rnn_layer_size=1024 \
---alpha=2.6 \
---beta=5.0 \
---cutoff_prob=0.99 \
---cutoff_top_n=40 \
---use_gru=True \
---use_gpu=True \
---share_rnn_weights=False \
---test_manifest="data/manifest.test" \
---mean_std_path="data/mean_std.npz" \
---vocab_path="data/vocab.txt" \
---model_path="checkpoints/step_final" \
---lang_model_path="${MAIN_ROOT}/models/lm/zh_giga.no_cna_cmn.prune01244.klm" \
---decoding_method="ctc_beam_search" \
---error_rate_type="cer" \
---specgram_type="linear"
+--device 'gpu' \
+--nproc 1 \
+--config conf/deepspeech2.yaml \
+--output ckpt
 
 if [ $? -ne 0 ]; then
     echo "Failed in evaluation!"

@@ -32,7 +32,7 @@ export FLAGS_sync_nccl_allreduce=0
 #--specgram_type="linear" \
 #--shuffle_method="batch_shuffle_clipped" \
-CUDA_VISIBLE_DEVICES=1,2,6,7 \
+CUDA_VISIBLE_DEVICES=2,3,5,7 \
 python3 -u ${MAIN_ROOT}/train.py \
 --device 'gpu' \
 --nproc 4 \

@@ -25,8 +25,8 @@ data:
 model:
   num_conv_layers: 2
   num_rnn_layers: 3
   rnn_layer_size: 2048
-  use_gru: False
+  use_gru: True
   share_rnn_weights: True
 training:
   n_epoch: 20

@@ -8,7 +8,7 @@ if [ $? -ne 0 ]; then
 fi
 cd - > /dev/null
 
-CUDA_VISIBLE_DEVICES=0,1,2,3 \
+CUDA_VISIBLE_DEVICES=0 \
 python3 -u ${MAIN_ROOT}/test.py \
 --device 'gpu' \
 --nproc 1 \

@@ -3,8 +3,7 @@
 export FLAGS_sync_nccl_allreduce=0
 
 #CUDA_VISIBLE_DEVICES=0,1,2,3 \
-#CUDA_VISIBLE_DEVICES=0,4,5,6 \
-CUDA_VISIBLE_DEVICES=0 \
+CUDA_VISIBLE_DEVICES=0,1 \
 python3 -u ${MAIN_ROOT}/train.py \
 --device 'gpu' \
 --nproc 1 \

@@ -68,32 +68,18 @@ class DeepSpeech2Trainer(Trainer):
         loss = self.criterion(logits, texts, logits_len, texts_len)
         return loss
 
-    def read_batch(self):
-        """Read a batch from the train_loader.
-
-        Returns
-        -------
-        List[Tensor]
-            A batch.
-        """
-        try:
-            batch = next(self.iterator)
-        except StopIteration as e:
-            raise e
-        return batch
-
-    def train_batch(self):
+    def train_batch(self, batch_data):
         start = time.time()
-        batch = self.read_batch()
-        data_loader_time = time.time() - start
-
-        self.optimizer.clear_grad()
         self.model.train()
-        audio, text, audio_len, text_len = batch
-        batch_size = audio.shape[0]
+
+        audio, text, audio_len, text_len = batch_data
         outputs = self.model(audio, text, audio_len, text_len)
-        loss = self.compute_losses(batch, outputs)
+        loss = self.compute_losses(batch_data, outputs)
         loss.backward()
         self.optimizer.step()
+        self.optimizer.clear_grad()
 
         iteration_time = time.time() - start
 
         losses_np = {
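The reordering in train_batch lands on the standard Paddle 2.x step sequence: backward, step, then clear_grad, so gradients are zeroed after the update rather than before the forward pass. A self-contained toy sketch (model, optimizer, and data here are placeholders):

import paddle

model = paddle.nn.Linear(4, 2)
optimizer = paddle.optimizer.SGD(learning_rate=0.1,
                                 parameters=model.parameters())

x = paddle.randn([8, 4])
loss = model(x).mean()

loss.backward()         # accumulate gradients
optimizer.step()        # apply the update
optimizer.clear_grad()  # reset gradients for the next batch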
@@ -104,13 +90,9 @@ class DeepSpeech2Trainer(Trainer):
         msg = "Train: Rank: {}, ".format(dist.get_rank())
         msg += "epoch: {}, ".format(self.epoch)
         msg += "step: {}, ".format(self.iteration)
-        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
-                                                  iteration_time)
-        msg += f"batch size: {batch_size}, "
+        msg += "time: {:>.3f}s, ".format(iteration_time)
         msg += ', '.join('{}: {:>.6f}'.format(k, v)
                          for k, v in losses_np.items())
-        #if self.iteration % 100 == 0:
         self.logger.info(msg)
 
         if dist.get_rank() == 0 and self.visualizer:
@@ -118,6 +100,14 @@ class DeepSpeech2Trainer(Trainer):
             self.visualizer.add_scalar("train/{}".format(k), v,
                                        self.iteration)
 
+    def new_epoch(self):
+        """Reset the train loader and increment ``epoch``."""
+        if self.parallel:
+            # batch sampler epoch starts from 0
+            self.train_loader.batch_sampler.set_epoch(self.epoch)
+        self.epoch += 1
+
     def train(self):
         """The training process.
@ -126,21 +116,20 @@ class DeepSpeech2Trainer(Trainer):
""" """
self.new_epoch() self.new_epoch()
while self.epoch <= self.config.training.n_epoch: while self.epoch <= self.config.training.n_epoch:
try: for batch in self.train_loader:
self.iteration += 1 self.iteration += 1
self.train_batch() self.train_batch(batch)
# if self.iteration % self.config.training.valid_interval == 0: # if self.iteration % self.config.training.valid_interval == 0:
# self.valid() # self.valid()
# if self.iteration % self.config.training.save_interval == 0: # if self.iteration % self.config.training.save_interval == 0:
# self.save() # self.save()
except StopIteration:
self.iteration -= 1 #epoch end, iteration ahead 1 self.valid()
self.valid() self.save()
self.save() self.lr_scheduler.step()
self.lr_scheduler.step() self.new_epoch()
self.new_epoch()
def compute_metrics(self, inputs, outputs): def compute_metrics(self, inputs, outputs):
pass pass
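The new_epoch hook exists so that, in the parallel case, each epoch gets a fresh shuffle that is still consistent across ranks: a distributed batch sampler seeds its permutation from the epoch number, so skipping set_epoch replays the same order every epoch. A minimal sketch with paddle.io (toy dataset; the trainer makes the equivalent set_epoch call on its own batch sampler):

from paddle.io import Dataset, DistributedBatchSampler

class ToyDataset(Dataset):
    def __len__(self):
        return 16
    def __getitem__(self, idx):
        return idx

sampler = DistributedBatchSampler(ToyDataset(), batch_size=4, shuffle=True)
for epoch in range(2):
    sampler.set_epoch(epoch)   # new, rank-consistent permutation per epoch
    print(list(sampler))       # batches of indices for this epoch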
@@ -152,14 +141,13 @@ class DeepSpeech2Trainer(Trainer):
         valid_losses = defaultdict(list)
         for i, batch in enumerate(self.valid_loader):
             audio, text, audio_len, text_len = batch
-            batch_size = audio.shape[0]
             outputs = self.model(audio, text, audio_len, text_len)
             loss = self.compute_losses(batch, outputs)
             metrics = self.compute_metrics(batch, outputs)
 
             valid_losses['val_loss'].append(float(loss))
             valid_losses['val_loss_div_batchsize'].append(
-                float(loss) / batch_size)
+                float(loss) / self.config.data.batch_size)
 
         # write visual log
         valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}
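One caveat with the new denominator: dividing by the configured batch size matches dividing by the actual batch size only when every validation batch is full (drop_last, or evenly divisible data); a ragged final batch skews the average. Toy numbers:

config_batch_size = 64
summed_losses = [128.0, 120.0, 30.0]   # per-batch summed losses
actual_sizes = [64, 64, 16]            # last batch is ragged

by_actual = sum(l / n for l, n in zip(summed_losses, actual_sizes)) / 3
by_config = sum(l / config_batch_size for l in summed_losses) / 3
print(by_actual, by_config)            # ~1.917 vs ~1.448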
