add train and config

pull/522/head
Hui Zhang 5 years ago
parent dd3fb069a3
commit 6cc80c0aff

.gitignore vendored

@@ -1,2 +1,4 @@
 .DS_Store
 *.pyc
+tools/venv
+dataset

@@ -25,7 +25,7 @@ from data_utils.augmentor.online_bayesian_normalization import \
     OnlineBayesianNormalizationAugmentor


-class AugmentationPipeline(object):
+class AugmentationPipeline():
     """Build a pre-processing pipeline with various augmentation models. Such a
     data augmentation pipeline is often leveraged to augment the training
     samples to make the model invariant to certain types of perturbations in the

@@ -16,7 +16,7 @@
 from abc import ABCMeta, abstractmethod


-class AugmentorBase(object):
+class AugmentorBase():
     """Abstract base class for augmentation model (augmentor) class.

     All augmentor classes should inherit from this class, and implement the
     following abstract methods.

@@ -31,8 +31,10 @@ from data_utils.speech import SpeechSegment
 from data_utils.normalizer import FeatureNormalizer

 __all__ = [
-    "DeepSpeech2Dataset", "DeepSpeech2DistributedBatchSampler",
-    "DeepSpeech2BatchSampler"
+    "DeepSpeech2Dataset",
+    "DeepSpeech2DistributedBatchSampler",
+    "DeepSpeech2BatchSampler",
+    "SpeechCollator",
 ]
@@ -46,9 +48,12 @@ class DeepSpeech2Dataset(Dataset):
                  min_duration=0.0,
                  stride_ms=10.0,
                  window_ms=20.0,
+                 n_fft=None,
                  max_freq=None,
+                 target_sample_rate=16000,
                  specgram_type='linear',
                  use_dB_normalization=True,
+                 target_dB=-20,
                  random_seed=0,
                  keep_transcription_text=False):
         super().__init__()
@@ -63,8 +68,11 @@ class DeepSpeech2Dataset(Dataset):
             specgram_type=specgram_type,
             stride_ms=stride_ms,
             window_ms=window_ms,
+            n_fft=n_fft,
             max_freq=max_freq,
-            use_dB_normalization=use_dB_normalization)
+            target_sample_rate=target_sample_rate,
+            use_dB_normalization=use_dB_normalization,
+            target_dB=target_dB)
         self._rng = random.Random(random_seed)
         self._keep_transcription_text = keep_transcription_text
         # for caching tar files info
@@ -459,6 +467,51 @@ class DeepSpeech2BatchSampler(BatchSampler):
         self.epoch = epoch


+class SpeechCollator():
+    def __init__(self, padding_to=-1):
+        """
+        Padding audio features with zeros to make them have the same shape (or
+        a user-defined shape) within one batch.
+
+        If ``padding_to`` is -1, the maximum shape in the batch will be used
+        as the target shape for padding. Otherwise, ``padding_to`` will be the
+        target shape (only refers to the second axis).
+        """
+        self._padding_to = padding_to
+
+    def __call__(self, batch):
+        # get target shape
+        max_length = max([audio.shape[1] for audio, _ in batch])
+        if self._padding_to != -1:
+            if self._padding_to < max_length:
+                raise ValueError("If padding_to is not -1, it should be larger "
+                                 "than any instance's shape in the batch")
+            max_length = self._padding_to
+        max_text_length = max([len(text) for _, text in batch])
+        # padding
+        padded_audios = []
+        audio_lens = []
+        texts, text_lens = [], []
+        for audio, text in batch:
+            # pad each spectrogram on the time axis (second axis) with zeros
+            padded_audio = np.zeros([audio.shape[0], max_length])
+            padded_audio[:, :audio.shape[1]] = audio
+            padded_audios.append(padded_audio)
+            audio_lens.append(audio.shape[1])
+            # pad each token-id sequence with zeros
+            padded_text = np.zeros([max_text_length])
+            padded_text[:len(text)] = text
+            texts.append(padded_text)
+            text_lens.append(len(text))
+        padded_audios = np.array(padded_audios).astype('float32')
+        audio_lens = np.array(audio_lens).astype('int64')
+        texts = np.array(texts).astype('int32')
+        text_lens = np.array(text_lens).astype('int64')
+        return padded_audios, texts, audio_lens, text_lens
+
+
 def create_dataloader(manifest_path,
                       vocab_filepath,
                       mean_std_filepath,
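
For intuition, a minimal sketch of what the new collator produces on a toy batch. The shapes and token ids below are made up for illustration, and the snippet assumes the SpeechCollator class above is in scope; in training it would typically be handed to paddle.io.DataLoader as collate_fn.

    import numpy as np

    # two hypothetical (spectrogram, token_ids) pairs of different lengths;
    # spectrograms follow the [freq_bins, time_steps] layout assumed above
    batch = [
        (np.random.randn(161, 80), [2, 7, 9]),
        (np.random.randn(161, 50), [4, 1]),
    ]
    audios, texts, audio_lens, text_lens = SpeechCollator(padding_to=-1)(batch)
    print(audios.shape)  # (2, 161, 80): both clips padded to the longest one
    print(audio_lens)    # [80 50]: true lengths kept so the model can mask padding
    print(texts)         # [[2 7 9] [4 1 0]]: ids zero-padded to the longest text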

@@ -52,6 +52,7 @@ class AudioFeaturizer(object):
                  specgram_type='linear',
                  stride_ms=10.0,
                  window_ms=20.0,
+                 n_fft=None,
                  max_freq=None,
                  target_sample_rate=16000,
                  use_dB_normalization=True,

@@ -63,7 +64,7 @@ class AudioFeaturizer(object):
         self._target_sample_rate = target_sample_rate
         self._use_dB_normalization = use_dB_normalization
         self._target_dB = target_dB
-        self._fft_point = None
+        self._fft_point = n_fft

     def featurize(self,
                   audio_segment,

@@ -56,6 +56,7 @@ class SpeechFeaturizer(object):
                  specgram_type='linear',
                  stride_ms=10.0,
                  window_ms=20.0,
+                 n_fft=None,
                  max_freq=None,
                  target_sample_rate=16000,
                  use_dB_normalization=True,

@@ -64,6 +65,7 @@ class SpeechFeaturizer(object):
             specgram_type=specgram_type,
             stride_ms=stride_ms,
             window_ms=window_ms,
+            n_fft=n_fft,
             max_freq=max_freq,
             target_sample_rate=target_sample_rate,
             use_dB_normalization=use_dB_normalization,
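
Threaded through SpeechFeaturizer into AudioFeaturizer, n_fft now sets the FFT size explicitly instead of the previous hard-coded None. When it stays None, spectrogram code conventionally derives the FFT size from the analysis window; a hedged sketch of that convention (not necessarily this repo's exact rule):

    # assumption: when n_fft is None, the FFT size defaults to the
    # window length in samples
    def default_n_fft(window_ms=20.0, sample_rate=16000):
        win_length = int(window_ms / 1000.0 * sample_rate)  # samples per window
        return win_length

    assert default_n_fft() == 320  # a 20 ms window at 16 kHz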

@@ -0,0 +1,70 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from yacs.config import CfgNode as CN
+
+_C = CN()
+
+_C.data = CN(
+    dict(
+        train_manifest="",
+        dev_manifest="",
+        test_manifest="",
+        vocab_filepath="",
+        mean_std_filepath="",
+        augmentation_config='{}',
+        max_duration=float('inf'),
+        min_duration=0.0,
+        stride_ms=10.0,  # ms
+        window_ms=20.0,  # ms
+        n_fft=None,  # fft points
+        max_freq=None,  # None for samplerate/2
+        specgram_type='linear',  # 'linear', 'mfcc'
+        target_sample_rate=16000,  # sample rate
+        use_dB_normalization=True,
+        target_dB=-20,
+        random_seed=0,
+        keep_transcription_text=False,
+        batch_size=32,  # batch size
+        num_workers=0,  # data loader workers
+        sortagrad=False,  # sorted in first epoch when True
+        shuffle_method="batch_shuffle",  # 'batch_shuffle', 'instance_shuffle'
+    ))
+
+_C.model = CN(
+    dict(
+        num_conv_layers=2,  # number of stacking convolution layers
+        num_rnn_layers=3,  # number of stacking RNN layers
+        rnn_layer_size=1024,  # RNN layer size (number of RNN cells)
+        use_gru=True,  # use GRU if True, simple RNN if False
+        share_rnn_weights=False,  # whether to share input-hidden weights
+        # between forward and backward directional RNNs; notice that for
+        # GRU, weight sharing is not supported
+    ))
+
+_C.training = CN(
+    dict(
+        lr=5e-4,  # learning rate
+        weight_decay=1e-6,  # the coeff of weight decay
+        global_grad_clip=400.0,  # the global norm clip
+        plot_interval=1000,  # plot attention and spectrogram by step
+        valid_interval=1000,  # validation by step
+        save_interval=1000,  # checkpoint by step
+        max_iteration=500000,  # max iteration to train by step
+        n_epoch=50,  # train epochs
+    ))
+
+
+def get_cfg_defaults():
+    """Get a yacs CfgNode object with default values for my_project."""
+    # Return a clone so that the defaults will not be altered
+    # This is for the "local variable" use pattern
+    return _C.clone()
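
For context, the intended consumption pattern for these defaults, mirroring the new train.py further down; the YAML path and override values here are hypothetical:

    from model_utils.config import get_cfg_defaults

    config = get_cfg_defaults()
    config.merge_from_file("conf/deepspeech2.yaml")  # hypothetical experiment file
    config.merge_from_list(["training.lr", 1e-3, "data.batch_size", 64])
    config.freeze()  # make the merged config immutable for the rest of the run
    print(config.training.lr)  # 0.001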

@@ -28,61 +28,15 @@ from distutils.dir_util import mkpath
 import paddle.fluid as fluid

 from training import Trainer
 from model_utils.network import DeepSpeech2
 from model_utils.network import DeepSpeech2Loss
+from model_utils.network import SpeechCollator

 from decoders.swig_wrapper import Scorer
 from decoders.swig_wrapper import ctc_greedy_decoder
 from decoders.swig_wrapper import ctc_beam_search_decoder_batch

-logging.basicConfig(
-    format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s')
-
-
-class SpeechCollator():
-    def __init__(self, padding_to=-1):
-        """
-        Padding audio features with zeros to make them have the same shape (or
-        a user-defined shape) within one batch.
-
-        If ``padding_to`` is -1, the maximum shape in the batch will be used
-        as the target shape for padding. Otherwise, ``padding_to`` will be the
-        target shape (only refers to the second axis).
-        """
-        self._padding_to = padding_to
-
-    def __call__(self, batch):
-        # get target shape
-        max_length = max([audio.shape[1] for audio, _ in batch])
-        if self._padding_to != -1:
-            if self._padding_to < max_length:
-                raise ValueError("If padding_to is not -1, it should be larger "
-                                 "than any instance's shape in the batch")
-            max_length = self._padding_to
-        max_text_length = max([len(text) for _, text in batch])
-        # padding
-        padded_audios = []
-        audio_lens = []
-        texts, text_lens = [], []
-        for audio, text in batch:
-            # audio
-            padded_audio = np.zeros([audio.shape[0], max_length])
-            padded_audio[:, :audio.shape[1]] = audio
-            padded_audios.append(padded_audio)
-            audio_lens.append(audio.shape[1])
-            # text
-            padded_text = np.zeros([max_text_length])
-            padded_text[:len(text)] = text
-            texts.append(padded_text)
-            text_lens.append(len(text))
-        padded_audios = np.array(padded_audios).astype('float32')
-        audio_lens = np.array(audio_lens).astype('int64')
-        texts = np.array(texts).astype('int32')
-        text_lens = np.array(text_lens).astype('int64')
-        return padded_audios, texts, audio_lens, text_lens
-

 class DeepSpeech2Trainer(Trainer):
     def __init__(self):
@@ -92,7 +46,7 @@ class DeepSpeech2Trainer(Trainer):
         config = self.config
         train_dataset = DeepSpeech2Dataset(
-            config.data.train_manifest_path,
+            config.data.train_manifest,
             config.data.vocab_filepath,
             config.data.mean_std_filepath,
             augmentation_config=config.data.augmentation_config,

@@ -100,14 +54,17 @@ class DeepSpeech2Trainer(Trainer):
             min_duration=config.data.min_duration,
             stride_ms=config.data.stride_ms,
             window_ms=config.data.window_ms,
+            n_fft=config.data.n_fft,
             max_freq=config.data.max_freq,
+            target_sample_rate=config.data.target_sample_rate,
             specgram_type=config.data.specgram_type,
             use_dB_normalization=config.data.use_dB_normalization,
+            target_dB=config.data.target_dB,
             random_seed=config.data.random_seed,
             keep_transcription_text=False)

         dev_dataset = DeepSpeech2Dataset(
-            config.data.dev_manifest_path,
+            config.data.dev_manifest,
             config.data.vocab_filepath,
             config.data.mean_std_filepath,
             augmentation_config=config.data.augmentation_config,

@@ -115,9 +72,12 @@ class DeepSpeech2Trainer(Trainer):
             min_duration=config.data.min_duration,
             stride_ms=config.data.stride_ms,
             window_ms=config.data.window_ms,
+            n_fft=config.data.n_fft,
             max_freq=config.data.max_freq,
+            target_sample_rate=config.data.target_sample_rate,
             specgram_type=config.data.specgram_type,
             use_dB_normalization=config.data.use_dB_normalization,
+            target_dB=config.data.target_dB,
             random_seed=config.data.random_seed,
             keep_transcription_text=False)
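
A hedged sketch of how these datasets presumably get wrapped into loaders; the actual wiring sits outside the shown hunks, and the sampler's keyword names here are assumptions, not the repo's confirmed signature:

    from paddle.io import DataLoader

    train_loader = DataLoader(
        train_dataset,
        batch_sampler=DeepSpeech2BatchSampler(  # defined earlier in this diff
            train_dataset, batch_size=config.data.batch_size, shuffle=True),
        collate_fn=SpeechCollator(padding_to=-1),
        num_workers=config.data.num_workers)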
@@ -167,14 +127,15 @@ class DeepSpeech2Trainer(Trainer):
         if self.parallel:
             model = paddle.DataParallel(model)

-        grad_clip = paddle.nn.ClipGradByGlobalNorm(config.training.grad_clip)
+        grad_clip = paddle.nn.ClipGradByGlobalNorm(
+            config.training.global_grad_clip)
         optimizer = paddle.optimizer.Adam(
             learning_rate=config.training.lr,
             parameters=model.parameters(),
             weight_decay=paddle.regularizer.L2Decay(
                 config.training.weight_decay),
-            grad_clip=grad_clip, )
+            grad_clip=grad_clip)

         criterion = DeepSpeech2Loss(self.train_loader.vocab_size)
@@ -255,7 +216,7 @@ class DeepSpeech2Trainer(Trainer):
         """
         self.model.eval()
         audio, text, audio_len, text_len = infer_data
-        logits, probs = self.model.predict(audio, audio_len)
+        _, probs = self.model.predict(audio, audio_len)
         return probs

     def decode_batch_greedy(self, probs_split, vocab_list):
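
The probabilities returned here feed the decoders imported above. For intuition, greedy CTC decoding is just a per-frame argmax, collapsing repeats, then dropping blanks; a toy sketch, not the swig decoder's implementation, and the blank id of 0 here is an assumption (CTC implementations also commonly put the blank last):

    import numpy as np

    def toy_ctc_greedy(probs, vocab, blank=0):
        # probs: [time, num_classes]; ids 1..N map to vocab[0..N-1]
        best = probs.argmax(axis=1)
        # collapse consecutive duplicates, then drop the blank token
        collapsed = [k for i, k in enumerate(best) if i == 0 or k != best[i - 1]]
        return "".join(vocab[k - 1] for k in collapsed if k != blank)

    probs = np.array([[0.1, 0.9, 0.0],   # 'a'
                      [0.1, 0.9, 0.0],   # 'a' (repeat, collapsed)
                      [0.8, 0.1, 0.1],   # blank
                      [0.1, 0.0, 0.9]])  # 'b'
    print(toy_ctc_greedy(probs, vocab="ab"))  # -> "ab"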

@@ -16,137 +16,45 @@
 import argparse
 import functools
 import io
+import logging

-from model_utils.model import DeepSpeech2Model
-from model_utils.model_check import check_cuda, check_version
-from data_utils.data import DataGenerator
-from utils.utility import add_arguments, print_arguments
-import paddle.fluid as fluid
+import paddle.distributed as dist

-parser = argparse.ArgumentParser(description=__doc__)
-add_arg = functools.partial(add_arguments, argparser=parser)
-# yapf: disable
-add_arg('batch_size',       int,    256,    "Minibatch size.")
-add_arg('num_epoch',        int,    200,    "# of training epochs.")
-add_arg('num_conv_layers',  int,    2,      "# of convolution layers.")
-add_arg('num_rnn_layers',   int,    3,      "# of recurrent layers.")
-add_arg('rnn_layer_size',   int,    2048,   "# of recurrent cells per layer.")
-add_arg('num_iter_print',   int,    100,    "Every # batch for printing "
-                                            "train cost.")
-add_arg('save_epoch',       int,    10,     "Every # epoch to save checkpoint and model params.")
-add_arg('num_samples',      int,    10000,  "The num of train samples.")
-add_arg('learning_rate',    float,  5e-4,   "Learning rate.")
-add_arg('max_duration',     float,  27.0,   "Longest audio duration allowed.")
-add_arg('min_duration',     float,  0.0,    "Shortest audio duration allowed.")
-add_arg('test_off',         bool,   False,  "Turn off testing.")
-add_arg('use_sortagrad',    bool,   True,   "Use SortaGrad or not.")
-add_arg('use_gpu',          bool,   True,   "Use GPU or not.")
-add_arg('use_gru',          bool,   False,  "Use GRUs instead of simple RNNs.")
-add_arg('is_local',         bool,   True,   "Use pserver or not.")
-add_arg('share_rnn_weights', bool,  True,   "Share input-hidden weights across "
-                                            "bi-directional RNNs. Not for GRU.")
-add_arg('init_from_pretrained_model', str, None,
-        "If None, the training starts from scratch, "
-        "otherwise, it resumes from the pre-trained model.")
-add_arg('train_manifest', str,
-        'data/librispeech/manifest.train',
-        "Filepath of train manifest.")
-add_arg('dev_manifest', str,
-        'data/librispeech/manifest.dev-clean',
-        "Filepath of validation manifest.")
-add_arg('mean_std_path', str,
-        'data/librispeech/mean_std.npz',
-        "Filepath of normalizer's mean & std.")
-add_arg('vocab_path', str,
-        'data/librispeech/vocab.txt',
-        "Filepath of vocabulary.")
-add_arg('output_model_dir', str,
-        "./checkpoints/libri",
-        "Directory for saving checkpoints.")
-add_arg('augment_conf_path', str,
-        'conf/augmentation.config',
-        "Filepath of augmentation configuration file (json-format).")
-add_arg('specgram_type', str,
-        'linear',
-        "Audio feature type. Options: linear, mfcc.",
-        choices=['linear', 'mfcc'])
-add_arg('shuffle_method', str,
-        'batch_shuffle_clipped',
-        "Shuffle method.",
-        choices=['instance_shuffle', 'batch_shuffle', 'batch_shuffle_clipped'])
-# yapf: enable
-args = parser.parse_args()
+from utils.model_check import check_cuda, check_version
+from utils.utility import print_arguments
+from training.cli import default_argument_parser
+from model_utils.config import get_cfg_defaults
+from model_utils.model import DeepSpeech2Trainer as Trainer

+logging.basicConfig(
+    format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s')

-def train():
-    """DeepSpeech2 training."""
+
+def main_sp(config, args):
+    exp = Trainer(config, args)
+    exp.setup()
+    exp.run()
+
+
+def main(config, args):
     # check if set use_gpu=True in paddlepaddle cpu version
-    check_cuda(args.use_gpu)
+    check_cuda(args.device == 'gpu')
     # check if paddlepaddle version is satisfied
     check_version()

-    if args.use_gpu:
-        place = fluid.CUDAPlace(0)
+    if args.nprocs > 1 and args.device == "gpu":
+        dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
     else:
-        place = fluid.CPUPlace()
-
-    train_generator = DataGenerator(
-        vocab_filepath=args.vocab_path,
-        mean_std_filepath=args.mean_std_path,
-        augmentation_config=io.open(
-            args.augment_conf_path, mode='r', encoding='utf8').read(),
-        max_duration=args.max_duration,
-        min_duration=args.min_duration,
-        specgram_type=args.specgram_type,
-        place=place)
-    dev_generator = DataGenerator(
-        vocab_filepath=args.vocab_path,
-        mean_std_filepath=args.mean_std_path,
-        augmentation_config="{}",
-        specgram_type=args.specgram_type,
-        place=place)
-
-    train_batch_reader = train_generator.batch_reader_creator(
-        manifest_path=args.train_manifest,
-        batch_size=args.batch_size,
-        sortagrad=args.use_sortagrad
-        if args.init_from_pretrained_model is None else False,
-        shuffle_method=args.shuffle_method)
-    dev_batch_reader = dev_generator.batch_reader_creator(
-        manifest_path=args.dev_manifest,
-        batch_size=args.batch_size,
-        sortagrad=False,
-        shuffle_method=None)
-
-    ds2_model = DeepSpeech2Model(
-        vocab_size=train_generator.vocab_size,
-        num_conv_layers=args.num_conv_layers,
-        num_rnn_layers=args.num_rnn_layers,
-        rnn_layer_size=args.rnn_layer_size,
-        use_gru=args.use_gru,
-        share_rnn_weights=args.share_rnn_weights,
-        place=place,
-        init_from_pretrained_model=args.init_from_pretrained_model,
-        output_model_dir=args.output_model_dir)
-
-    ds2_model.train(
-        train_batch_reader=train_batch_reader,
-        dev_batch_reader=dev_batch_reader,
-        feeding_dict=train_generator.feeding,
-        learning_rate=args.learning_rate,
-        gradient_clipping=400,
-        batch_size=args.batch_size,
-        num_samples=args.num_samples,
-        num_epoch=args.num_epoch,
-        save_epoch=args.save_epoch,
-        num_iterations_print=args.num_iter_print,
-        test_off=args.test_off)
-
-
-def main():
-    print_arguments(args)
-    train()
+        main_sp(config, args)


-if __name__ == '__main__':
-    main()
+if __name__ == "__main__":
+    config = get_cfg_defaults()
+    parser = default_argument_parser()
+    args = parser.parse_args()
+    if args.config:
+        config.merge_from_file(args.config)
+    if args.opts:
+        config.merge_from_list(args.opts)
+    config.freeze()
+    print(config)
+    print_arguments(args)
+
+    main(config, args)
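
The multi-process branch follows paddle.distributed.spawn's standard pattern: the worker function is re-entered once per process. A minimal self-contained sketch of that pattern (the worker body is illustrative, not the trainer's):

    import paddle
    import paddle.distributed as dist

    def worker():
        # each spawned process initializes its own parallel environment
        dist.init_parallel_env()
        print(f"worker {dist.get_rank()}/{dist.get_world_size()} ready")

    if __name__ == "__main__":
        if paddle.is_compiled_with_cuda():
            dist.spawn(worker, nprocs=2)  # one trainer process per GPU
        else:
            worker  # no CUDA build: main() above falls back to one process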

@@ -0,0 +1,64 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+
+
+def default_argument_parser():
+    r"""A simple yet general argument parser for experiments with parakeet.
+
+    This is used in examples with parakeet. And it is intended to be used by
+    other experiments with parakeet. It requires a minimal set of command line
+    arguments to start a training script.
+
+    The ``--config`` and ``--opts`` options are used to overwrite the default
+    configuration.
+
+    The ``--data`` and ``--output`` options specify the data path and output
+    path. Resuming training from existing progress at the output directory is
+    the intended default behavior.
+
+    The ``--checkpoint_path`` option specifies the checkpoint to load from.
+
+    The ``--device`` and ``--nprocs`` options specify how to run the training.
+
+    See Also
+    --------
+    parakeet.training.experiment
+
+    Returns
+    -------
+    argparse.ArgumentParser
+        the parser
+    """
+    parser = argparse.ArgumentParser()
+
+    # yapf: disable
+    # data and output
+    parser.add_argument("--config", metavar="FILE", help="path of the config file to overwrite the default config with.")
+    parser.add_argument("--data", metavar="DATA_DIR", help="path to the dataset.")
+    parser.add_argument("--output", metavar="OUTPUT_DIR", help="path to save checkpoints and logs.")
+
+    # load from saved checkpoint
+    parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load")
+
+    # running
+    parser.add_argument("--device", type=str, choices=["cpu", "gpu"], help="device type to use, cpu and gpu are supported.")
+    parser.add_argument("--nprocs", type=int, default=1, help="number of parallel processes to use.")
+
+    # overwrite extra config and default config
+    parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
+    # yapf: enable
+
+    return parser
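
A quick sketch of the resulting CLI surface; the argument values are made up, and note that --opts swallows the remainder of the command line for yacs-style KEY VALUE overrides:

    from training.cli import default_argument_parser

    parser = default_argument_parser()
    # simulating: python train.py --device gpu --nprocs 4 --opts training.lr 0.001
    args = parser.parse_args(
        ["--device", "gpu", "--nprocs", "4", "--opts", "training.lr", "0.001"])
    print(args.device, args.nprocs)  # gpu 4
    print(args.opts)                 # ['training.lr', '0.001'] -> config.merge_from_list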