add lr_decay

refactor code
pull/522/head
Hui Zhang 5 years ago
parent e80c741131
commit 76290375b7

@ -232,7 +232,7 @@ In order to inform the trainer of what augmentation components are needed and wh
When the `--augment_conf_file` argument of `trainer.py` is set to the path of the above example configuration file, every audio clip in every epoch will be processed: with a 60% chance it is first speed-perturbed at a rate sampled uniformly between 0.95 and 1.05, and then with an 80% chance it is shifted in time by an offset sampled randomly between -5 ms and 5 ms. Finally, this newly synthesized audio clip is fed into the feature extractor for further training.
For other configuration examples, please refer to `conf/augmenatation.config.example`.
For other configuration examples, please refer to `examples/conf/augmentation.config.example`.
Be careful when applying data augmentation: improper augmentation can harm training by enlarging the train-test gap.
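For reference, a minimal sketch of what such a configuration could look like, written here as a Python snippet that emits the JSON augmentor list. The `speed`/`shift` type names, parameter keys, and the `my_augmentation.config` file name are assumptions for illustration; check them against `examples/conf/augmentation.config.example`.

```python
import json

# Sketch of an augmentation pipeline matching the paragraph above:
# 60% chance of speed perturbation in [0.95, 1.05], then 80% chance of
# a time shift in [-5, 5] ms. Keys are assumed to follow the repo's
# JSON augmentor format; verify against augmentation.config.example.
augmentation = [
    {"type": "speed",
     "params": {"min_speed_rate": 0.95, "max_speed_rate": 1.05},
     "prob": 0.6},
    {"type": "shift",
     "params": {"min_shift_ms": -5, "max_shift_ms": 5},
     "prob": 0.8},
]

with open("my_augmentation.config", "w") as f:
    json.dump(augmentation, f, indent=4)

# Then point the trainer at it, e.g.:
#   python3 trainer.py --augment_conf_file my_augmentation.config ...
```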

@ -232,7 +232,7 @@ python3 train.py --help
When the `--augment_conf_file` argument of `trainer.py` is set to the path of the above example configuration file, every audio clip in every epoch will be processed: with a 60% chance it is first speed-perturbed at a rate sampled uniformly between 0.95 and 1.05, and then with an 80% chance it is shifted in time by an offset sampled randomly between -5 ms and 5 ms. Finally, this newly synthesized audio clip is fed into the feature extractor for further training.
For other configuration examples, please refer to `conf/augmenatation.config.example`.
For other configuration examples, please refer to `examples/conf/augmentation.config.example`.
Be careful when using data augmentation: improper augmentation can hurt the trained model by enlarging the difference between the training and test sets, widening the gap between training and prediction.

@ -31,14 +31,15 @@ model:
training:
n_epoch: 20
lr: 5e-4
lr_decay: 1.0
weight_decay: 1e-06
global_grad_clip: 400.0
global_grad_clip: 5.0
max_iteration: 500000
plot_interval: 1000
save_interval: 1000
valid_interval: 1000
decoding:
batch_size: 10
batch_size: 128
error_rate_type: cer
decoding_method: ctc_beam_search
lang_model_path: models/lm/zh_giga.no_cna_cmn.prune01244.klm

@ -31,8 +31,9 @@ model:
training:
n_epoch: 20
lr: 1e-5
lr_decay: 1.0
weight_decay: 1e-06
global_grad_clip: 400.0
global_grad_clip: 5.0
max_iteration: 500000
plot_interval: 1000
save_interval: 1000

@ -3,10 +3,10 @@
export FLAGS_sync_nccl_allreduce=0
#CUDA_VISIBLE_DEVICES=0,1,2,3 \
CUDA_VISIBLE_DEVICES=0,1 \
CUDA_VISIBLE_DEVICES=0 \
python3 -u ${MAIN_ROOT}/train.py \
--device 'gpu' \
--nproc 2 \
--nproc 1 \
--config conf/deepspeech2.yaml \
--output ckpt

@ -53,8 +53,9 @@ _C.model = CN(
_C.training = CN(
dict(
lr=5e-4, # learning rate
lr_decay=1.0, # learning rate decay
weight_decay=1e-6, # the coeff of weight decay
global_grad_clip=400.0, # the global norm clip
global_grad_clip=5.0, # the global norm clip
plot_interval=1000, # plot attention and spectrogram by step
valid_interval=1000, # validation by step
save_interval=1000, # checkpoint by step

@ -250,25 +250,15 @@ class DeepSpeech2Trainer(Trainer):
print_params(model, self.logger)
grad_clip = MyClipGradByGlobalNorm(config.training.global_grad_clip)
# optimizer = paddle.optimizer.Adam(
# learning_rate=config.training.lr,
# parameters=model.parameters(),
# weight_decay=paddle.regularizer.L2Decay(
# config.training.weight_decay),
# grad_clip=grad_clip)
#learning_rate=fluid.layers.exponential_decay(
# learning_rate=learning_rate,
# decay_steps=num_samples / batch_size / dev_count,
# decay_rate=0.83,
# staircase=True),
lr_scheduler = paddle.optimizer.lr.ExponentialDecay(
learning_rate=config.training.lr, gamma=0.83, verbose=True)
learning_rate=config.training.lr,
gamma=config.training.lr_decay,
verbose=True)
optimizer = paddle.optimizer.Adam(
learning_rate=lr_scheduler,
parameters=model.parameters(),
weight_decay=paddle.regularizer.L2Decay(
config.training.weight_decay),
grad_clip=grad_clip)
criterion = DeepSpeech2Loss(self.train_loader.dataset.vocab_size)
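For context, a minimal standalone sketch of the scheduler/optimizer wiring this hunk introduces, assuming paddle >= 2.0. The `lr`, `lr_decay`, `weight_decay`, and `global_grad_clip` values mirror the `config.training` keys in this PR; the toy `Linear` model is only a placeholder, and the standard `paddle.nn.ClipGradByGlobalNorm` is used here in place of the repo's `MyClipGradByGlobalNorm`.

```python
import paddle

# Placeholder model; the real trainer builds a DeepSpeech2 network.
model = paddle.nn.Linear(10, 10)

lr, lr_decay, weight_decay, global_grad_clip = 5e-4, 1.0, 1e-6, 5.0

# gamma == 1.0 keeps the learning rate constant; gamma < 1.0 multiplies
# it by that factor every time scheduler.step() is called.
scheduler = paddle.optimizer.lr.ExponentialDecay(
    learning_rate=lr, gamma=lr_decay, verbose=True)

optimizer = paddle.optimizer.Adam(
    learning_rate=scheduler,
    parameters=model.parameters(),
    weight_decay=paddle.regularizer.L2Decay(weight_decay),
    grad_clip=paddle.nn.ClipGradByGlobalNorm(global_grad_clip))

for epoch in range(3):
    # ... run training steps with optimizer.step() / optimizer.clear_grad() ...
    scheduler.step()  # advance the exponential decay once per epoch
```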
@ -458,22 +448,12 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
output_dir = Path(self.args.output).expanduser() / "infer"
output_dir.mkdir(parents=True, exist_ok=True)
else:
output_dir = Path(self.args.checkpoint_path).expanduser().parent / "infer"
output_dir = Path(
self.args.checkpoint_path).expanduser().parent / "infer"
output_dir.mkdir(parents=True, exist_ok=True)
self.output_dir = output_dir
# def setup_checkpointer(self):
# """Create a directory used to save checkpoints into.
# It is "checkpoints" inside the output directory.
# """
# # checkpoint dir
# checkpoint_dir = self.output_dir / "checkpoints"
# checkpoint_dir.mkdir(exist_ok=True)
# self.checkpoint_dir = checkpoint_dir
def setup(self):
"""Setup the experiment.
"""
@ -506,7 +486,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
num_rnn_layers=config.model.num_rnn_layers,
rnn_size=config.model.rnn_layer_size,
share_rnn_weights=config.model.share_rnn_weights)
if self.parallel:
model = paddle.DataParallel(model)

@ -1,29 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Set up paths for DS2"""
import os.path
import sys
def add_path(path):
if path not in sys.path:
sys.path.insert(0, path)
this_dir = os.path.dirname(__file__)
# Add project path to PYTHONPATH
proj_path = os.path.join(this_dir, '..')
add_path(proj_path)

@ -14,6 +14,7 @@
import time
import logging
import logging.handlers
from pathlib import Path
import numpy as np
from collections import defaultdict
@ -249,7 +250,22 @@ class Trainer():
Each process has its own text logger. The logging message is written to
the standard output and a text file named ``worker_n.log`` in the
output directory, where ``n`` is the rank of the process.
when - how to split the log file by time interval
'S' : Seconds
'M' : Minutes
'H' : Hours
'D' : Days
'W' : Week day
default value: 'D'
format - format of the log
default format:
%(levelname)s: %(asctime)s: %(filename)s:%(lineno)d * %(thread)d %(message)s
INFO: 12-09 18:02:42: log.py:40 * 139814749787872 HELLO WORLD
backup - how many backup files to keep
default value: 7
"""
when = 'D'
backup = 7
format = '[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
logger = logging.getLogger(__name__)
@ -270,6 +286,12 @@ class Trainer():
# file_handler.setFormatter(formatter)
# logger.addHandler(file_handler)
handler = logging.handlers.TimedRotatingFileHandler(
str(self.output_dir / "warning.log"), when=when, backupCount=backup)
handler.setLevel(logging.WARNING)
handler.setFormatter(formatter)
logger.addHandler(handler)
# global logger
stdout = False
save_path = log_file
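Outside the Trainer class, a minimal standalone sketch of the timed-rotation setup the docstring above describes (standard library only; the plain `warning.log` file name stands in for `self.output_dir / "warning.log"`):

```python
import logging
import logging.handlers

# Rotate the log file daily ('D') and keep 7 backups, using the same
# message format as the Trainer above.
fmt = '[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
handler = logging.handlers.TimedRotatingFileHandler(
    "warning.log", when='D', backupCount=7)
handler.setLevel(logging.WARNING)
handler.setFormatter(logging.Formatter(fmt=fmt))

logger = logging.getLogger("worker")
logger.addHandler(handler)
logger.warning("rotated into a dated backup file once per day")
```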

@ -34,11 +34,9 @@ add_arg('num_batches', int, -1, "# of batches tuning on. "
"Default -1, on whole dev set.")
add_arg('batch_size', int, 256, "# of samples per batch.")
add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).")
add_arg('beam_size', int, 500, "Beam search width.")
add_arg('num_proc_bsearch', int, 8, "# of CPUs for beam search.")
add_arg('num_conv_layers', int, 2, "# of convolution layers.")
add_arg('num_rnn_layers', int, 3, "# of recurrent layers.")
add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.")
add_arg('num_alphas', int, 45, "# of alpha candidates for tuning.")
add_arg('num_betas', int, 8, "# of beta candidates for tuning.")
add_arg('alpha_from', float, 1.0, "Where alpha starts tuning from.")
@ -47,10 +45,15 @@ add_arg('beta_from', float, 0.1, "Where beta starts tuning from.")
add_arg('beta_to', float, 0.45, "Where beta ends tuning with.")
add_arg('cutoff_prob', float, 1.0, "Cutoff probability for pruning.")
add_arg('cutoff_top_n', int, 40, "Cutoff number for pruning.")
add_arg('num_conv_layers', int, 2, "# of convolution layers.")
add_arg('num_rnn_layers', int, 3, "# of recurrent layers.")
add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.")
add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.")
add_arg('use_gpu', bool, True, "Use GPU or not.")
add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across "
"bi-directional RNNs. Not for GRU.")
add_arg('tune_manifest', str,
'data/librispeech/manifest.dev-clean',
"Filepath of manifest to tune.")
@ -127,6 +130,8 @@ def tune():
err_sum = [0.0 for i in range(len(params_grid))]
err_ave = [0.0 for i in range(len(params_grid))]
num_ins, len_refs, cur_batch = 0, 0, 0
# initialize external scorer
ds2_model.init_ext_scorer(args.alpha_from, args.beta_from,
@ -156,6 +161,7 @@ def tune():
for target, result in zip(target_transcripts, result_transcripts):
errors, len_ref = errors_func(target, result)
err_sum[index] += errors
# accumulate the length of references of every batch
# in the first iteration
if args.alpha_from == alpha and args.beta_from == beta:

@ -1,49 +0,0 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import paddle
import paddle.fluid as fluid
def check_cuda(use_cuda, err = \
"\nYou can not set use_cuda = True in the model because you are using paddlepaddle-cpu.\n \
Please: 1. Install paddlepaddle-gpu to run your models on GPU or 2. Set use_cuda = False to run models on CPU.\n"
):
"""
Log error and exit when set use_gpu=true in paddlepaddle
cpu version.
"""
try:
if use_cuda == True and fluid.is_compiled_with_cuda() == False:
print(err)
sys.exit(1)
except Exception as e:
pass
def check_version():
"""
Log error and exit when the installed version of paddlepaddle is
not satisfied.
"""
err = "PaddlePaddle version 2.0.0 or higher is required, " \
"or a suitable develop version is satisfied as well. \n" \
"Please make sure the version is good with your code." \
try:
fluid.require_version('2.0.0')
except Exception as e:
print(err)
sys.exit(1)