fix ctc loss

lr schedule
sortagrad
logger
pull/522/head
Hui Zhang 5 years ago
parent 45ae60b198
commit aeb09d812b

.gitignore vendored

@@ -2,3 +2,4 @@
 *.pyc
 tools/venv
 dataset
+models/*

@ -15,6 +15,7 @@
import math import math
import random import random
import tarfile import tarfile
import logging
import numpy as np import numpy as np
import paddle import paddle
from paddle.io import Dataset from paddle.io import Dataset
@@ -30,6 +31,8 @@ from data_utils.featurizer.speech_featurizer import SpeechFeaturizer
 from data_utils.speech import SpeechSegment
 from data_utils.normalizer import FeatureNormalizer

+logger = logging.getLogger(__name__)
+
 __all__ = [
     "DeepSpeech2Dataset",
     "DeepSpeech2DistributedBatchSampler",
@@ -234,9 +237,13 @@ class DeepSpeech2DistributedBatchSampler(DistributedBatchSampler):
         assert (clipped == False)
         if not clipped:
             res_len = len(indices) - shift_len - len(batch_indices)
+            assert res_len != 0, f"_batch_shuffle clipped {len(indices)} , {shift_len}, {len(batch_indices)}"
             # when res_len is 0, will return whole list, len(List[-0:]) = len(List[:])
             batch_indices.extend(indices[-res_len:])
             batch_indices.extend(indices[0:shift_len])
+        assert len(indices) == len(
+            batch_indices
+        ), f"_batch_shuffle: {len(indices)} : {len(batch_indices)} : {res_len} - {shift_len}"
         return batch_indices

     def __iter__(self):
@@ -247,8 +254,8 @@ class DeepSpeech2DistributedBatchSampler(DistributedBatchSampler):
         # sort (by duration) or batch-wise shuffle the manifest
         if self.shuffle:
-            if self.epoch == 0 and self.sortagrad:
-                pass
+            if self.epoch == 0 and self._sortagrad:
+                logger.info(f'dataset sortagrad! epoch {self.epoch}')
             else:
                 if self._shuffle_method == "batch_shuffle":
                     indices = self._batch_shuffle(
@@ -374,9 +381,13 @@ class DeepSpeech2BatchSampler(BatchSampler):
         assert (clipped == False)
         if not clipped:
             res_len = len(indices) - shift_len - len(batch_indices)
+            assert res_len != 0, f"_batch_shuffle clipped {len(indices)} , {shift_len}, {len(batch_indices)}"
             # when res_len is 0, will return whole list, len(List[-0:]) = len(List[:])
             batch_indices.extend(indices[-res_len:])
             batch_indices.extend(indices[0:shift_len])
+        assert len(indices) == len(
+            batch_indices
+        ), f"_batch_shuffle: {len(indices)} : {len(batch_indices)} : {res_len} - {shift_len}"
         return batch_indices

     def __iter__(self):
@@ -388,7 +399,7 @@ class DeepSpeech2BatchSampler(BatchSampler):
         # sort (by duration) or batch-wise shuffle the manifest
         if self.shuffle:
             if self.epoch == 0 and self._sortagrad:
-                pass
+                logger.info(f'dataset sortagrad! epoch {self.epoch}')
             else:
                 if self._shuffle_method == "batch_shuffle":
                     indices = self._batch_shuffle(
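For context on the two new asserts: _batch_shuffle takes the duration-sorted indices, applies a random shift of at most one batch, shuffles whole batches, and re-attaches the clipped head and tail, so the asserts guard against dropping or duplicating samples (extending with indices[-0:] would append the whole list again). Below is a minimal standalone sketch of that scheme; the names and the seed handling are illustrative, not the project's exact code.

import random

def batch_shuffle(indices, batch_size, clipped=False, seed=0):
    """Shuffle duration-sorted indices batch by batch with a random head shift."""
    rng = random.Random(seed)
    shift_len = rng.randint(0, batch_size - 1)
    # group the shifted indices into full batches and shuffle whole batches
    starts = range(shift_len, len(indices) - batch_size + 1, batch_size)
    batches = [indices[s:s + batch_size] for s in starts]
    rng.shuffle(batches)
    batch_indices = [idx for batch in batches for idx in batch]
    if not clipped:
        # re-attach whatever did not fill a full batch
        res_len = len(indices) - shift_len - len(batch_indices)
        if res_len != 0:                      # indices[-0:] would re-add everything
            batch_indices.extend(indices[-res_len:])
        batch_indices.extend(indices[0:shift_len])
        assert len(batch_indices) == len(indices)
    return batch_indices

print(batch_shuffle(list(range(10)), batch_size=4))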

@@ -3,33 +3,42 @@
 # train model
 # if you wish to resume from an exists model, uncomment --init_from_pretrained_model
 export FLAGS_sync_nccl_allreduce=0

-CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
-python3 -u ${MAIN_ROOT}/train.py \
---batch_size=64 \
---num_epoch=50 \
---num_conv_layers=2 \
---num_rnn_layers=3 \
---rnn_layer_size=1024 \
---num_iter_print=100 \
---save_epoch=1 \
---num_samples=120000 \
---learning_rate=5e-4 \
---max_duration=27.0 \
---min_duration=0.0 \
---test_off=False \
---use_sortagrad=True \
---use_gru=True \
---use_gpu=True \
---is_local=True \
---share_rnn_weights=False \
---train_manifest="data/manifest.train" \
---dev_manifest="data/manifest.dev" \
---mean_std_path="data/mean_std.npz" \
---vocab_path="data/vocab.txt" \
---output_model_dir="./checkpoints" \
---augment_conf_path="${MAIN_ROOT}/conf/augmentation.config" \
---specgram_type="linear" \
---shuffle_method="batch_shuffle_clipped" \
+#CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+#python3 -u ${MAIN_ROOT}/train.py \
+#--batch_size=64 \
+#--num_epoch=50 \
+#--num_conv_layers=2 \
+#--num_rnn_layers=3 \
+#--rnn_layer_size=1024 \
+#--num_iter_print=100 \
+#--save_epoch=1 \
+#--num_samples=120000 \
+#--learning_rate=5e-4 \
+#--max_duration=27.0 \
+#--min_duration=0.0 \
+#--test_off=False \
+#--use_sortagrad=True \
+#--use_gru=True \
+#--use_gpu=True \
+#--is_local=True \
+#--share_rnn_weights=False \
+#--train_manifest="data/manifest.train" \
+#--dev_manifest="data/manifest.dev" \
+#--mean_std_path="data/mean_std.npz" \
+#--vocab_path="data/vocab.txt" \
+#--output_model_dir="./checkpoints" \
+#--augment_conf_path="${MAIN_ROOT}/conf/augmentation.config" \
+#--specgram_type="linear" \
+#--shuffle_method="batch_shuffle_clipped" \
+
+CUDA_VISIBLE_DEVICES=1,2,6,7 \
+python3 -u ${MAIN_ROOT}/train.py \
+--device 'gpu' \
+--nproc 4 \
+--config conf/deepspeech2.yaml \
+--output ckpt

 if [ $? -ne 0 ]; then
     echo "Failed in training!"

@@ -2,10 +2,12 @@
 export FLAGS_sync_nccl_allreduce=0

-CUDA_VISIBLE_DEVICES=0,1,2,3 \
+#CUDA_VISIBLE_DEVICES=0,1,2,3 \
+#CUDA_VISIBLE_DEVICES=0,4,5,6 \
+CUDA_VISIBLE_DEVICES=0 \
 python3 -u ${MAIN_ROOT}/train.py \
 --device 'gpu' \
---nproc 4 \
+--nproc 1 \
 --config conf/deepspeech2.yaml \
 --output ckpt

@@ -41,6 +41,22 @@ from decoders.swig_wrapper import ctc_beam_search_decoder_batch
 from utils.error_rate import char_errors, word_errors, cer, wer

+# class ExponentialDecayOld(LRScheduler):
+#     def __init__(self, learning_rate, gamma, last_epoch=-1,
+#                  decay_steps, decay_rate, staircase=False, verbose=False):
+#         self.learning_rate = learning_rate
+#         self.decay_steps = decay_steps
+#         self.decay_rate = decay_rate
+#         self.staircase = staircase
+#         super(ExponentialDecay, self).__init__(learning_rate, last_epoch,
+#                                                verbose)
+
+#     def get_lr(self):
+#         div_res = self.step_num / self.decay_steps
+#         if self.staircase:
+#             div_res = paddle.floor(div_res)
+
+#         return self.base_lr * (self.decay_rate**div_res)

 class DeepSpeech2Trainer(Trainer):
     def __init__(self, config, args):
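The commented-out class above is a dead-code sketch of porting the old fluid exponential_decay(staircase=...) schedule onto the 2.0-style LRScheduler API; the commit ends up using the built-in paddle.optimizer.lr.ExponentialDecay instead. For reference, a working version would look roughly like the following hedged sketch, which uses last_epoch as the step counter since LRScheduler does not appear to expose a step_num attribute; the class name is illustrative.

import math
from paddle.optimizer.lr import LRScheduler

class StaircaseExponentialDecay(LRScheduler):
    """Illustrative port of fluid exponential_decay with optional staircase."""

    def __init__(self, learning_rate, decay_steps, decay_rate,
                 staircase=False, last_epoch=-1, verbose=False):
        self.decay_steps = decay_steps
        self.decay_rate = decay_rate
        self.staircase = staircase
        super().__init__(learning_rate, last_epoch, verbose)

    def get_lr(self):
        # last_epoch counts how many times step() has been called
        div_res = self.last_epoch / self.decay_steps
        if self.staircase:
            div_res = math.floor(div_res)
        return self.base_lr * (self.decay_rate ** div_res)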
@@ -73,21 +89,28 @@ class DeepSpeech2Trainer(Trainer):
         self.optimizer.clear_grad()
         self.model.train()
         audio, text, audio_len, text_len = batch
+        batch_size = audio.shape[0]
         outputs = self.model(audio, text, audio_len, text_len)
         loss = self.compute_losses(batch, outputs)
         loss.backward()
         self.optimizer.step()
         iteration_time = time.time() - start

-        losses_np = {'train_loss': float(loss)}
+        losses_np = {
+            'train_loss': float(loss),
+            'train_loss_div_batchsize':
+            float(loss) / self.config.data.batch_size
+        }
         msg = "Train: Rank: {}, ".format(dist.get_rank())
         msg += "epoch: {}, ".format(self.epoch)
         msg += "step: {}, ".format(self.iteration)
         msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
                                                   iteration_time)
+        msg += f"batch size: {batch_size}, "
         msg += ', '.join('{}: {:>.6f}'.format(k, v)
                          for k, v in losses_np.items())
+        #if self.iteration % 100 == 0:
         self.logger.info(msg)

         if dist.get_rank() == 0 and self.visualizer:
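One nuance in the logging above: DeepSpeech2Loss (further down in this commit) returns the loss summed over the batch, and train_loss_div_batchsize divides by the configured batch size, whereas the valid loop divides by the actual audio.shape[0], which is robust to a short final batch. A tiny hedged sketch of that per-utterance normalization, with made-up numbers:

def per_utterance_loss(summed_loss, actual_batch_size):
    # summed_loss is assumed to be float(loss) where the criterion sums over the batch
    return summed_loss / actual_batch_size

print(per_utterance_loss(128.0, 4))   # 32.0, i.e. float(loss) / audio.shape[0]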
@@ -107,15 +130,16 @@ class DeepSpeech2Trainer(Trainer):
                 self.iteration += 1
                 self.train_batch()

-                if self.iteration % self.config.training.valid_interval == 0:
-                    self.valid()
+                # if self.iteration % self.config.training.valid_interval == 0:
+                #     self.valid()

-                if self.iteration % self.config.training.save_interval == 0:
-                    self.save()
+                # if self.iteration % self.config.training.save_interval == 0:
+                #     self.save()
             except StopIteration:
                 self.iteration -= 1  #epoch end, iteration ahead 1
                 self.valid()
                 self.save()
+                self.lr_scheduler.step()
                 self.new_epoch()

     def compute_metrics(self, inputs, outputs):
@@ -128,11 +152,14 @@ class DeepSpeech2Trainer(Trainer):
         valid_losses = defaultdict(list)
         for i, batch in enumerate(self.valid_loader):
             audio, text, audio_len, text_len = batch
+            batch_size = audio.shape[0]
             outputs = self.model(audio, text, audio_len, text_len)
             loss = self.compute_losses(batch, outputs)
             metrics = self.compute_metrics(batch, outputs)

             valid_losses['val_loss'].append(float(loss))
+            valid_losses['val_loss_div_batchsize'].append(
+                float(loss) / batch_size)

         # write visual log
         valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}
@@ -166,19 +193,33 @@ class DeepSpeech2Trainer(Trainer):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(
             config.training.global_grad_clip)

-        optimizer = paddle.optimizer.Adam(
-            learning_rate=config.training.lr,
-            parameters=model.parameters(),
-            weight_decay=paddle.regularizer.L2Decay(
-                config.training.weight_decay),
-            grad_clip=grad_clip)
+        # optimizer = paddle.optimizer.Adam(
+        #     learning_rate=config.training.lr,
+        #     parameters=model.parameters(),
+        #     weight_decay=paddle.regularizer.L2Decay(
+        #         config.training.weight_decay),
+        #     grad_clip=grad_clip)
+
+        #learning_rate=fluid.layers.exponential_decay(
+        #    learning_rate=learning_rate,
+        #    decay_steps=num_samples / batch_size / dev_count,
+        #    decay_rate=0.83,
+        #    staircase=True),
+        lr_scheduler = paddle.optimizer.lr.ExponentialDecay(
+            learning_rate=config.training.lr, gamma=0.83, verbose=True)
+        optimizer = paddle.optimizer.Adam(
+            learning_rate=lr_scheduler,
+            parameters=model.parameters(),
+            grad_clip=grad_clip)

         criterion = DeepSpeech2Loss(self.train_loader.dataset.vocab_size)

         self.model = model
         self.optimizer = optimizer
+        self.lr_scheduler = lr_scheduler
         self.criterion = criterion
-        self.logger.info("Setup model/optimizer/criterion!")
+        self.logger.info("Setup model/optimizer/lr_scheduler/criterion!")

     def setup_dataloader(self):
         config = self.config
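To make the new wiring concrete, here is a minimal toy sketch of the scheduler/optimizer pattern used above. The values and the stand-in model are illustrative; the real values come from conf/deepspeech2.yaml, and the scheduler is stepped once per epoch in train() on StopIteration.

import paddle

model = paddle.nn.Linear(10, 10)                        # stand-in for DeepSpeech2
lr_scheduler = paddle.optimizer.lr.ExponentialDecay(
    learning_rate=5e-4, gamma=0.83, verbose=True)       # lr *= 0.83 on each step()
optimizer = paddle.optimizer.Adam(
    learning_rate=lr_scheduler,
    parameters=model.parameters(),
    grad_clip=paddle.nn.ClipGradByGlobalNorm(5.0))      # illustrative clip value

for epoch in range(3):
    # ... run the batches of one epoch, calling optimizer.step()/clear_grad() ...
    lr_scheduler.step()    # decay once per epoch, mirroring train() above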

@@ -29,6 +29,28 @@ from decoders.swig_wrapper import ctc_beam_search_decoder_batch

 __all__ = ['DeepSpeech2', 'DeepSpeech2Loss']

+
+def ctc_loss(log_probs,
+             labels,
+             input_lengths,
+             label_lengths,
+             blank=0,
+             reduction='mean',
+             norm_by_times=True):
+    #print("my ctc loss with norm by times")
+    loss_out = paddle.fluid.layers.warpctc(log_probs, labels, blank, norm_by_times,
+                                           input_lengths, label_lengths)
+
+    loss_out = paddle.fluid.layers.squeeze(loss_out, [-1])
+    assert reduction in ['mean', 'sum', 'none']
+    if reduction == 'mean':
+        loss_out = paddle.mean(loss_out / label_lengths)
+    elif reduction == 'sum':
+        loss_out = paddle.sum(loss_out)
+    return loss_out
+
+
+F.ctc_loss = ctc_loss
+
+
 def brelu(x, t_min=0.0, t_max=24.0, name=None):
     t_min = paddle.to_tensor(t_min)
     t_max = paddle.to_tensor(t_max)
@@ -683,12 +705,13 @@ class DeepSpeech2Loss(nn.Layer):
         # last token id as blank id
         self.loss = nn.CTCLoss(blank=vocab_size, reduction='none')

-    def forward(self, logits, text, audio_len, text_len):
+    def forward(self, logits, text, logits_len, text_len):
         # warp-ctc do softmax on activations
         # warp-ctc need activation with shape [T, B, V + 1]
         logits = logits.transpose([1, 0, 2])
-        ctc_loss = self.loss(logits, text, audio_len, text_len)
-        ctc_loss /= text_len  # norm_by_times
+        ctc_loss = self.loss(logits, text, logits_len, text_len)
+        ## https://github.com/PaddlePaddle/Paddle/blob/f5ca2db2cc/paddle/fluid/operators/warpctc_op.h#L403
+        #ctc_loss /= logits_len  # norm_by_times
         ctc_loss = ctc_loss.sum()
         return ctc_loss
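A self-contained sketch of how this loss module is driven, using only APIs visible in this diff: the model's logits arrive as [batch, time, vocab_size + 1], warp-ctc wants time-major input, and with reduction='none' the per-utterance losses are summed without the old division by text_len. The shapes, lengths, and values below are made up for illustration.

import paddle
import paddle.nn as nn

vocab_size = 28
B, T, U = 4, 100, 12                                    # batch, time steps, label length
logits = paddle.randn([B, T, vocab_size + 1])           # raw activations, pre-softmax
text = paddle.randint(0, vocab_size, shape=[B, U], dtype='int32')   # label ids
logits_len = paddle.full([B], T, dtype='int64')         # valid time steps per utterance
text_len = paddle.full([B], U, dtype='int64')           # valid labels per utterance

loss_fn = nn.CTCLoss(blank=vocab_size, reduction='none')
per_utt = loss_fn(logits.transpose([1, 0, 2]), text, logits_len, text_len)
loss = per_utt.sum()                                    # no division by text_len here
print(float(loss))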

@@ -161,9 +161,10 @@ class Trainer():
     def new_epoch(self):
         """Reset the train loader and increment ``epoch``.
         """
-        self.epoch += 1
         if self.parallel:
+            # batch sampler epoch start from 0
             self.train_loader.batch_sampler.set_epoch(self.epoch)
+        self.epoch += 1
         self.iterator = iter(self.train_loader)

     def train(self):
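The reordering matters for sortagrad: the samplers earlier in this commit only keep the duration-sorted order while their epoch counter is 0, so set_epoch must see the un-incremented value and the increment moves after the call. A hedged toy sketch of set_epoch usage with the stock paddle.io.DistributedBatchSampler (the dataset is an illustrative stand-in for the speech dataset):

from paddle.io import Dataset, DistributedBatchSampler

class ToyDataset(Dataset):
    def __len__(self):
        return 16
    def __getitem__(self, idx):
        return idx

sampler = DistributedBatchSampler(ToyDataset(), batch_size=4, shuffle=True)
for epoch in range(3):
    sampler.set_epoch(epoch)        # epoch numbering starts at 0, matching sortagrad
    batches = list(iter(sampler))   # per-epoch deterministic shuffle, same on all ranks
    print(epoch, batches)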
@@ -246,22 +247,30 @@ class Trainer():
         the standard output and a text file named ``worker_n.log`` in the
         output directory, where ``n`` means the rank of the process.
         """
+        format = '[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
         logger = logging.getLogger(__name__)
         logger.setLevel("INFO")
-        formatter = logging.Formatter(
-            fmt='[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s',
-            datefmt='%Y/%m/%d %H:%M:%S')
+        formatter = logging.Formatter(fmt=format, datefmt='%Y/%m/%d %H:%M:%S')

-        stream_handler = logging.StreamHandler()
-        stream_handler.setFormatter(formatter)
-        logger.addHandler(stream_handler)
+        #stream_handler = logging.StreamHandler()
+        #stream_handler.setFormatter(formatter)
+        #logger.addHandler(stream_handler)

         log_file = self.output_dir / 'worker_{}.log'.format(dist.get_rank())
         file_handler = logging.FileHandler(str(log_file))
         file_handler.setFormatter(formatter)
         logger.addHandler(file_handler)

+        # global logger
+        stdout = True
+        save_path = '/dev/null'
+        logging.basicConfig(
+            level=logging.DEBUG if stdout else logging.INFO,
+            format=format,
+            datefmt='%Y/%m/%d %H:%M:%S',
+            filename=save_path if not stdout else None)
+
         self.logger = logger

     @mp_tools.rank_zero_only
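The net effect of this change: console output is handled once by the root logger via basicConfig, so module-level loggers (such as the logger = logging.getLogger(__name__) added to the dataset above) also show up, while the trainer's own logger keeps a per-worker file handler. A hedged standalone sketch of that layout; the file path and logger names are illustrative:

import logging

fmt = '[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
logging.basicConfig(level=logging.DEBUG, format=fmt, datefmt='%Y/%m/%d %H:%M:%S')

trainer_logger = logging.getLogger('trainer')
file_handler = logging.FileHandler('worker_0.log')      # per-rank file, illustrative path
file_handler.setFormatter(logging.Formatter(fmt=fmt, datefmt='%Y/%m/%d %H:%M:%S'))
trainer_logger.addHandler(file_handler)

logging.getLogger('dataset_module').info('reaches the console via the root logger')
trainer_logger.info('reaches the console and worker_0.log')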
