add config and train script

5 years ago · 4dc75c40c9
parent 6cc80c0aff
commit 4dc75c40c9
12 changed files with 251 additions and 84 deletions
--- a/data_utils/dataset.py
+++ b/data_utils/dataset.py
@ -109,6 +109,10 @@ class DeepSpeech2Dataset(Dataset):
        """
        return self._speech_featurizer.vocab_list
    @property
    def feature_size(self):
        return self._speech_featurizer.feature_size
    def _parse_tar(self, file):
        """Parse a tar file to get a tarfile object
        and a map containing tarinfoes
@ -200,7 +204,7 @@ class DeepSpeech2DistributedBatchSampler(DistributedBatchSampler):
        self._sortagrad = sortagrad
        self._shuffle_method = shuffle_method
-    def _batch_shuffle(self, manifest, batch_size, clipped=False):
+    def _batch_shuffle(self, indices, batch_size, clipped=False):
        """Put similarly-sized instances into minibatches for better efficiency
        and make a batch-wise shuffle.
@ -210,8 +214,8 @@ class DeepSpeech2DistributedBatchSampler(DistributedBatchSampler):
           for different epochs. Create minibatches.
        4. Shuffle the minibatches.
-        :param manifest: Manifest contents. List of dict.
+        :param indices: indexes. List of int.
-        :type manifest: list
+        :type indices: list
        :param batch_size: Batch size. This size is also used for generate
                           a random number for batch shuffle.
        :type batch_size: int
@ -222,16 +226,16 @@ class DeepSpeech2DistributedBatchSampler(DistributedBatchSampler):
        :rtype: list
        """
        rng = np.random.RandomState(self.epoch)
        manifest.sort(key=lambda x: x["duration"])
        shift_len = rng.randint(0, batch_size - 1)
-        batch_manifest = list(zip(*[iter(manifest[shift_len:])] * batch_size))
+        batch_indices = list(zip(* [iter(indices[shift_len:])] * batch_size))
-        rng.shuffle(batch_manifest)
+        rng.shuffle(batch_indices)
-        batch_manifest = [item for batch in batch_manifest for item in batch]
+        batch_indices = [item for batch in batch_indices for item in batch]
        assert (clipped == False)
        if not clipped:
-            res_len = len(manifest) - shift_len - len(batch_manifest)
+            res_len = len(indices) - shift_len - len(batch_indices)
-            batch_manifest.extend(manifest[-res_len:])
+            batch_indices.extend(indices[-res_len:])
-            batch_manifest.extend(manifest[0:shift_len])
+            batch_indices.extend(indices[0:shift_len])
-        return batch_manifest
+        return batch_indices
    def __iter__(self):
        num_samples = len(self.dataset)
@ -336,7 +340,7 @@ class DeepSpeech2BatchSampler(BatchSampler):
        self._sortagrad = sortagrad
        self._shuffle_method = shuffle_method
-    def _batch_shuffle(self, manifest, batch_size, clipped=False):
+    def _batch_shuffle(self, indices, batch_size, clipped=False):
        """Put similarly-sized instances into minibatches for better efficiency
        and make a batch-wise shuffle.
@ -346,8 +350,8 @@ class DeepSpeech2BatchSampler(BatchSampler):
           for different epochs. Create minibatches.
        4. Shuffle the minibatches.
-        :param manifest: Manifest contents. List of dict.
+        :param indices: indexes. List of int.
-        :type manifest: list
+        :type indices: list
        :param batch_size: Batch size. This size is also used for generate
                           a random number for batch shuffle.
        :type batch_size: int
@ -358,16 +362,16 @@ class DeepSpeech2BatchSampler(BatchSampler):
        :rtype: list
        """
        rng = np.random.RandomState(self.epoch)
        manifest.sort(key=lambda x: x["duration"])
        shift_len = rng.randint(0, batch_size - 1)
-        batch_manifest = list(zip(*[iter(manifest[shift_len:])] * batch_size))
+        batch_indices = list(zip(* [iter(indices[shift_len:])] * batch_size))
-        rng.shuffle(batch_manifest)
+        rng.shuffle(batch_indices)
-        batch_manifest = [item for batch in batch_manifest for item in batch]
+        batch_indices = [item for batch in batch_indices for item in batch]
        assert (clipped == False)
        if not clipped:
-            res_len = len(manifest) - shift_len - len(batch_manifest)
+            res_len = len(indices) - shift_len - len(batch_indices)
-            batch_manifest.extend(manifest[-res_len:])
+            batch_indices.extend(indices[-res_len:])
-            batch_manifest.extend(manifest[0:shift_len])
+            batch_indices.extend(indices[0:shift_len])
-        return batch_manifest
+        return batch_indices
    def __iter__(self):
        num_samples = len(self.dataset)
@ -377,7 +381,7 @@ class DeepSpeech2BatchSampler(BatchSampler):
        # sort (by duration) or batch-wise shuffle the manifest
        if self.shuffle:
-            if self.epoch == 0 and self.sortagrad:
+            if self.epoch == 0 and self._sortagrad:
                pass
            else:
                if self._shuffle_method == "batch_shuffle":
--- a/data_utils/featurizer/audio_featurizer.py
+++ b/data_utils/featurizer/audio_featurizer.py
@ -103,15 +103,19 @@ class AudioFeaturizer(object):
    @property
    def feature_size(self):
        """audio feature size"""
        feat_dim = 0
        if self._specgram_type == 'linear':
            fft_point = self._window_ms if self._fft_point is None else self._fft_point
-            return fft_point * (self._target_sample_rate / 1000) / 2 + 1
+            feat_dim = int(fft_point * (self._target_sample_rate / 1000) / 2 +
                           1)
        elif self._specgram_type == 'mfcc':
            # mfcc,delta, delta-delta
-            return 13 * 3
+            feat_dim = int(13 * 3)
        else:
            raise ValueError("Unknown specgram_type %s. "
                             "Supported values: linear." % self._specgram_type)
        print('feat_dim:', feat_dim)
        return feat_dim
    def _compute_specgram(self, samples, sample_rate):
        """Extract various audio features."""
--- a/examples/tiny/conf/augmentation.config
+++ b/examples/tiny/conf/augmentation.config
@ -0,0 +1,8 @@
 [
    {
        "type": "shift",
        "params": {"min_shift_ms": -5,
                   "max_shift_ms": 5},
        "prob": 1.0
    }
 ]
--- a/examples/tiny/conf/deepspeech2.yaml
+++ b/examples/tiny/conf/deepspeech2.yaml
@ -0,0 +1,39 @@
 # https://yaml.org/type/float.html
 data:
  train_manifest: data/manifest.tiny
  dev_manifest: data/manifest.tiny
  test_manifest: data/manifest.tiny
  mean_std_filepath: data/mean_std.npz
  vocab_filepath: data/vocab.txt 
  augmentation_config: conf/augmentation.config
  batch_size: 4
  max_duration: 27.0
  min_duration: 0.0
  specgram_type: linear
  target_sample_rate: 16000
  max_freq: None
  n_fft: None
  stride_ms: 10.0
  window_ms: 20.0
  use_dB_normalization: True
  target_dB: -20
  random_seed: 0
  keep_transcription_text: False
  sortagrad: True 
  shuffle_method: batch_shuffle
  num_workers: 0
 model:
  num_conv_layers: 2
  num_rnn_layers: 3
  rnn_layer_size: 2048 
  use_gru: False 
  share_rnn_weights: True 
 training:
  n_epoch: 20
  lr: 1e-5 
  weight_decay: 1e-06
  global_grad_clip: 400.0
  max_iteration: 500000
  plot_interval: 1000
  save_interval: 1000
  valid_interval: 1000
--- a/examples/tiny/local/run_train.sh
+++ b/examples/tiny/local/run_train.sh
@ -3,33 +3,23 @@
 # train model
 # if you wish to resume from an exists model, uncomment --init_from_pretrained_model
 export FLAGS_sync_nccl_allreduce=0
-CUDA_VISIBLE_DEVICES=0,1,2,3 \
+
 #CUDA_VISIBLE_DEVICES=0,1,2,3 \
 #python3 -u ${MAIN_ROOT}/train.py \
 #--num_iter_print=1 \
 #--save_epoch=1 \
 #--num_samples=64 \
 #--test_off=False \
 #--is_local=True \
 #--output_model_dir="./checkpoints/" \
 #--shuffle_method="batch_shuffle_clipped" \
 #CUDA_VISIBLE_DEVICES=0,1,2,3 \
 CUDA_VISIBLE_DEVICES=1,2,3 \
 python3 -u ${MAIN_ROOT}/train.py \
--batch_size=4 \
+--nproc 1 \
--num_epoch=20 \
+--config conf/deepspeech2.yaml \
--num_conv_layers=2 \
+--output ckpt
 --num_rnn_layers=3 \
 --rnn_layer_size=2048 \
 --num_iter_print=1 \
 --save_epoch=1 \
 --num_samples=64 \
 --learning_rate=1e-5 \
 --max_duration=27.0 \
 --min_duration=0.0 \
 --test_off=False \
 --use_sortagrad=True \
 --use_gru=False \
 --use_gpu=True \
 --is_local=True \
 --share_rnn_weights=True \
 --train_manifest="data/manifest.tiny" \
 --dev_manifest="data/manifest.tiny" \
 --mean_std_path="data/mean_std.npz" \
 --vocab_path="data/vocab.txt" \
 --output_model_dir="./checkpoints/" \
 --augment_conf_path="${MAIN_ROOT}/conf/augmentation.config" \
 --specgram_type="linear" \
 --shuffle_method="batch_shuffle_clipped" \
 if [ $? -ne 0 ]; then
    echo "Failed in training!"
--- a/model_utils/config.py
+++ b/model_utils/config.py
@ -22,7 +22,7 @@ _C.data = CN(
        test_manifest="",
        vocab_filepath="",
        mean_std_filepath="",
-        augmentation_config='{}',
+        augmentation_config="",
        max_duration=float('inf'),
        min_duration=0.0,
        stride_ms=10.0,  # ms
--- a/model_utils/model.py
+++ b/model_utils/model.py
@ -13,25 +13,26 @@
 # limitations under the License.
 """Contains DeepSpeech2 model."""
 import io
 import sys
 import os
 import time
 import logging
 import gzip
 import copy
 import inspect
 import collections
 import multiprocessing
 import numpy as np
 from distutils.dir_util import mkpath
-import paddle.fluid as fluid
+import paddle
 from paddle import distributed as dist
 from paddle.io import DataLoader
 from utils import mp_tools
 from training import Trainer
 from model_utils.network import DeepSpeech2
 from model_utils.network import DeepSpeech2Loss
-from model_utils.network import SpeechCollator
+
 from data_utils.dataset import SpeechCollator
 from data_utils.dataset import DeepSpeech2Dataset
 from data_utils.dataset import DeepSpeech2DistributedBatchSampler
 from data_utils.dataset import DeepSpeech2BatchSampler
 from decoders.swig_wrapper import Scorer
 from decoders.swig_wrapper import ctc_greedy_decoder
@ -39,7 +40,8 @@ from decoders.swig_wrapper import ctc_beam_search_decoder_batch
 class DeepSpeech2Trainer(Trainer):
-    def __init__(self):
+    def __init__(self, config, args):
        super().__init__(config, args)
        self._ext_scorer = None
    def setup_dataloader(self):
@ -49,7 +51,9 @@ class DeepSpeech2Trainer(Trainer):
            config.data.train_manifest,
            config.data.vocab_filepath,
            config.data.mean_std_filepath,
-            augmentation_config=config.data.augmentation_config,
+            augmentation_config=io.open(
                config.data.augmentation_config, mode='r',
                encoding='utf8').read(),
            max_duration=config.data.max_duration,
            min_duration=config.data.min_duration,
            stride_ms=config.data.stride_ms,
@ -67,7 +71,7 @@ class DeepSpeech2Trainer(Trainer):
            config.data.dev_manifest,
            config.data.vocab_filepath,
            config.data.mean_std_filepath,
-            augmentation_config=config.data.augmentation_config,
+            augmentation_config="{}",
            max_duration=config.data.max_duration,
            min_duration=config.data.min_duration,
            stride_ms=config.data.stride_ms,
@ -117,8 +121,8 @@ class DeepSpeech2Trainer(Trainer):
    def setup_model(self):
        config = self.config
        model = DeepSpeech2(
-            feat_size=self.train_loader.feature_size,
+            feat_size=self.train_loader.dataset.feature_size,
-            dict_size=self.train_loader.vocab_size,
+            dict_size=self.train_loader.dataset.vocab_size,
            num_conv_layers=config.model.num_conv_layers,
            num_rnn_layers=config.model.num_rnn_layers,
            rnn_size=config.model.rnn_layer_size,
@ -133,11 +137,11 @@ class DeepSpeech2Trainer(Trainer):
        optimizer = paddle.optimizer.Adam(
            learning_rate=config.training.lr,
            parameters=model.parameters(),
-            weight_decay=paddle.regulaerizer.L2Decay(
+            weight_decay=paddle.regularizer.L2Decay(
                config.training.weight_decay),
            grad_clip=grad_clip)
-        criterion = DeepSpeech2Loss(self.train_loader.vocab_size)
+        criterion = DeepSpeech2Loss(self.train_loader.dataset.vocab_size)
        self.model = model
        self.optimizer = optimizer
--- a/model_utils/network2_test.py
+++ b/model_utils/network2_test.py
@ -0,0 +1,104 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from network2 import DeepSpeech2
 import paddle
 import numpy as np
 if __name__ == '__main__':
    batch_size = 2
    feat_dim = 161
    max_len = 100
    audio = np.random.randn(batch_size, feat_dim, max_len)
    audio_len = np.random.randint(100, size=batch_size, dtype='int32')
    audio_len[-1] = 100
    text = np.array([[1, 2], [1, 2]], dtype='int32')
    text_len = np.array([2] * batch_size, dtype='int32')
    place = paddle.CUDAPinnedPlace()
    audio = paddle.to_tensor(
        audio, dtype='float32', place=place, stop_gradient=True)
    audio_len = paddle.to_tensor(
        audio_len, dtype='int64', place=place, stop_gradient=True)
    text = paddle.to_tensor(
        text, dtype='int32', place=place, stop_gradient=True)
    text_len = paddle.to_tensor(
        text_len, dtype='int64', place=place, stop_gradient=True)
    print(audio.shape)
    print(audio_len.shape)
    print(text.shape)
    print(text_len.shape)
    print("-----------------")
    model = DeepSpeech2(
        feat_size=feat_dim,
        dict_size=10,
        num_conv_layers=2,
        num_rnn_layers=3,
        rnn_size=1024,
        use_gru=False,
        share_rnn_weights=False, )
    probs = model(audio, text, audio_len, text_len)
    print('probs.shape', probs.shape)
    print("-----------------")
    model2 = DeepSpeech2(
        feat_size=feat_dim,
        dict_size=10,
        num_conv_layers=2,
        num_rnn_layers=3,
        rnn_size=1024,
        use_gru=True,
        share_rnn_weights=False, )
    probs = model2(audio, text, audio_len, text_len)
    print('probs.shape', probs.shape)
    print("-----------------")
    model3 = DeepSpeech2(
        feat_size=feat_dim,
        dict_size=10,
        num_conv_layers=2,
        num_rnn_layers=3,
        rnn_size=1024,
        use_gru=False,
        share_rnn_weights=True, )
    probs = model3(audio, text, audio_len, text_len)
    print('probs.shape', probs.shape)
    print("-----------------")
    model4 = DeepSpeech2(
        feat_size=feat_dim,
        dict_size=10,
        num_conv_layers=2,
        num_rnn_layers=3,
        rnn_size=1024,
        use_gru=True,
        share_rnn_weights=True, )
    probs = model4(audio, text, audio_len, text_len)
    print('probs.shape', probs.shape)
    print("-----------------")
    model5 = DeepSpeech2(
        feat_size=feat_dim,
        dict_size=10,
        num_conv_layers=2,
        num_rnn_layers=3,
        rnn_size=1024,
        use_gru=False,
        share_rnn_weights=False, )
    probs = model5(audio, text, audio_len, text_len)
    print('probs.shape', probs.shape)
    print("-----------------")
--- a/requirements.txt
+++ b/requirements.txt
@ -2,3 +2,5 @@ scipy==1.2.1
 resampy==0.2.2
 SoundFile==0.9.0.post1
 python_speech_features
 tensorboardX
 yacs
--- a/train.py
+++ b/train.py
@ -13,20 +13,19 @@
 # limitations under the License.
 """Trainer for DeepSpeech2 model."""
 import io
 import logging
 import argparse
 import functools
 import io
-from utils.model_check import check_cuda, check_version
+from paddle import distributed as dist
 from utils.utility import print_arguments
 from training.cli import default_argument_parser
 from model_utils.config import get_cfg_defaults
 from model_utils.model import DeepSpeech2Trainer as Trainer
 logging.basicConfig(
    format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s')
 def main_sp(config, args):
    exp = Trainer(config, args)
@ -35,26 +34,27 @@ def main_sp(config, args):
 def main(config, args):
-    # check if set use_gpu=True in paddlepaddle cpu version
+    if args.device == "gpu" and args.nprocs > 1:
    check_cuda(args.device == 'gpu')
    # check if paddlepaddle version is satisfied
    check_version()
    if args.nprocs > 1 and args.device == "gpu":
        dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
    else:
        main_sp(config, args)
 if __name__ == "__main__":
    config = get_cfg_defaults()
    parser = default_argument_parser()
    args = parser.parse_args()
    print_arguments(args)
    # https://yaml.org/type/float.html
    config = get_cfg_defaults()
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    print(config)
-    print_arguments(args)
+    if args.dump_config:
        with open(args.dump_config, 'w') as f:
            print(config, file=f)
    main(config, args)
--- a/training/cli.py
+++ b/training/cli.py
@ -47,6 +47,7 @@ def default_argument_parser():
    # yapf: disable
    # data and output
    parser.add_argument("--config", metavar="FILE", help="path of the config file to overwrite to default config with.")
    parser.add_argument("--dump-config", metavar="FILE", help="dump config to yaml file.")
    parser.add_argument("--data", metavar="DATA_DIR", help="path to the datatset.")
    parser.add_argument("--output", metavar="OUTPUT_DIR", help="path to save checkpoint and logs.")
@ -54,7 +55,7 @@ def default_argument_parser():
    parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load")
    # running
-    parser.add_argument("--device", type=str, choices=["cpu", "gpu"], help="device type to use, cpu and gpu are supported.")
+    parser.add_argument("--device", type=str, default='gpu', choices=["cpu", "gpu"], help="device type to use, cpu and gpu are supported.")
    parser.add_argument("--nprocs", type=int, default=1, help="number of parallel processes to use.")
    # overwrite extra config and default config
--- a/training/trainer.py
+++ b/training/trainer.py
@ -20,6 +20,7 @@ from collections import defaultdict
 import paddle
 from paddle import distributed as dist
 from paddle.distributed.utils import get_gpus
 from tensorboardX import SummaryWriter
 from utils import checkpoint
@ -238,9 +239,19 @@ class Trainer():
        """
        logger = logging.getLogger(__name__)
        logger.setLevel("INFO")
-        logger.addHandler(logging.StreamHandler())
+
        formatter = logging.Formatter(
            fmt='[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s',
            datefmt='%Y/%m/%d %H:%M:%S')
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(formatter)
        logger.addHandler(stream_handler)
        log_file = self.output_dir / 'worker_{}.log'.format(dist.get_rank())
-        logger.addHandler(logging.FileHandler(str(log_file)))
+        file_handler = logging.FileHandler(str(log_file))
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
        self.logger = logger