PaddleSpeech/train.py

"""Trainer for DeepSpeech2 model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import functools
import io
from model_utils.model import DeepSpeech2Model
from model_utils.model_check import check_cuda, check_version
from data_utils.data import DataGenerator
from utils.utility import add_arguments, print_arguments

import paddle.fluid as fluid

# Command-line configuration for the trainer. `add_arg` binds the shared
# parser so every option below is declared as (name, type, default, help),
# optionally followed by keyword arguments such as `choices`.
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('batch_size',       int,    256,    "Minibatch size.")
add_arg('num_epoch',        int,    200,    "# of training epochs.")
add_arg('num_conv_layers',  int,    2,      "# of convolution layers.")
add_arg('num_rnn_layers',   int,    3,      "# of recurrent layers.")
add_arg('rnn_layer_size',   int,    2048,   "# of recurrent cells per layer.")
add_arg('num_iter_print',   int,    100,    "Every # batch for printing "
                                            "train cost.")
add_arg('save_epoch',       int,    10,     "Every # epoch for saving checkpoint "
                                            "and model params.")
add_arg('num_samples',      int,    10000,  "The num of train samples.")
add_arg('learning_rate',    float,  5e-4,   "Learning rate.")
add_arg('max_duration',     float,  27.0,   "Longest audio duration allowed.")
add_arg('min_duration',     float,  0.0,    "Shortest audio duration allowed.")
add_arg('test_off',         bool,   False,  "Turn off testing.")
add_arg('use_sortagrad',    bool,   True,   "Use SortaGrad or not.")
add_arg('use_gpu',          bool,   True,   "Use GPU or not.")
add_arg('use_gru',          bool,   False,  "Use GRUs instead of simple RNNs.")
# NOTE(review): 'is_local' is declared here but not read by train() in this
# file -- confirm whether another consumer still needs it.
add_arg('is_local',         bool,   True,   "Use pserver or not.")
add_arg('share_rnn_weights',bool,   True,   "Share input-hidden weights across "
                                            "bi-directional RNNs. Not for GRU.")
add_arg('init_from_pretrained_model',str,
        None,
        "If None, the training starts from scratch, "
        "otherwise, it resumes from the pre-trained model.")

add_arg('train_manifest',   str,
        'data/librispeech/manifest.train',
        "Filepath of train manifest.")
add_arg('dev_manifest',     str,
        'data/librispeech/manifest.dev-clean',
        "Filepath of validation manifest.")
add_arg('mean_std_path',    str,
        'data/librispeech/mean_std.npz',
        "Filepath of normalizer's mean & std.")
add_arg('vocab_path',       str,
        'data/librispeech/vocab.txt',
        "Filepath of vocabulary.")
add_arg('output_model_dir', str,
        "./checkpoints/libri",
        "Directory for saving checkpoints.")
add_arg('augment_conf_path',str,
        'conf/augmentation.config',
        "Filepath of augmentation configuration file (json-format).")
add_arg('specgram_type',    str,
        'linear',
        "Audio feature type. Options: linear, mfcc.",
        choices=['linear', 'mfcc'])
add_arg('shuffle_method',   str,
        'batch_shuffle_clipped',
        "Shuffle method.",
        choices=['instance_shuffle', 'batch_shuffle', 'batch_shuffle_clipped'])
# yapf: enable
# Arguments are parsed at module level, so `args` is available to train()
# and main() as a global.
args = parser.parse_args()


def train():
    """DeepSpeech2 training.

    Reads every setting from the module-level ``args``: builds the train and
    dev ``DataGenerator`` objects and their batch readers, constructs a
    ``DeepSpeech2Model`` and calls its ``train`` method.
    """

    # check if set use_gpu=True in paddlepaddle cpu version
    check_cuda(args.use_gpu)
    # check if paddlepaddle version is satisfied
    check_version()

    # Run on CUDA device 0 when GPU use is requested, otherwise on the CPU.
    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()

    # Read the augmentation config through a context manager so the file
    # handle is closed deterministically instead of being left to the GC.
    with io.open(args.augment_conf_path, mode='r', encoding='utf8') as conf_file:
        augmentation_config = conf_file.read()

    train_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config=augmentation_config,
        max_duration=args.max_duration,
        min_duration=args.min_duration,
        specgram_type=args.specgram_type,
        place=place)
    # The dev generator gets an empty augmentation config ("{}") and no
    # explicit duration limits.
    dev_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config="{}",
        specgram_type=args.specgram_type,
        place=place)

    # SortaGrad is only requested when training starts from scratch; it is
    # forced off whenever init_from_pretrained_model is given.
    use_sortagrad = (args.use_sortagrad
                     if args.init_from_pretrained_model is None else False)
    train_batch_reader = train_generator.batch_reader_creator(
        manifest_path=args.train_manifest,
        batch_size=args.batch_size,
        sortagrad=use_sortagrad,
        shuffle_method=args.shuffle_method)
    dev_batch_reader = dev_generator.batch_reader_creator(
        manifest_path=args.dev_manifest,
        batch_size=args.batch_size,
        sortagrad=False,
        shuffle_method=None)

    ds2_model = DeepSpeech2Model(
        vocab_size=train_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        share_rnn_weights=args.share_rnn_weights,
        place=place,
        init_from_pretrained_model=args.init_from_pretrained_model,
        output_model_dir=args.output_model_dir)

    # gradient_clipping is fixed at 400 here; it is not exposed as a
    # command-line option.
    ds2_model.train(
        train_batch_reader=train_batch_reader,
        dev_batch_reader=dev_batch_reader,
        feeding_dict=train_generator.feeding,
        learning_rate=args.learning_rate,
        gradient_clipping=400,
        batch_size=args.batch_size,
        num_samples=args.num_samples,
        num_epoch=args.num_epoch,
        save_epoch=args.save_epoch,
        num_iterations_print=args.num_iter_print,
        test_off=args.test_off)


def main():
    """Report the parsed arguments via print_arguments, then start training."""
    # Echo the effective configuration first so it is visible before the
    # training run begins.
    print_arguments(args)
    train()


# Script entry point: call main() only when this file is executed directly.
if __name__ == '__main__':
    main()
Add function, class and module docs for data parts in DS2. 8 years ago			`"""Trainer for DeepSpeech2 model."""`
Refactor whole data preprocessor for DS2 (re-design classes, re-organize dir, add augmentaion interfaces etc.). 1. Refactor data preprocessor with new added class AudioSegment, SpeechSegment, TextFeaturizer, AudioFeaturizer, SpeechFeaturizer. 2. Add data augmentation interfaces and class AugmentorBase, AugmentationPipeline, VolumnPerturbAugmentor etc.. 3. Seperate normalizer's mean and std computing from training, by adding FeatureNormalizer and a seperate tool compute_mean_std.py. 4. Re-organize directory. 8 years ago			`from __future__ import absolute_import`
			`from __future__ import division`
			`from __future__ import print_function`

Add librispeech dataset, audio data provider and simplfied DeepSpeech2 model configuration. Bug exists when run training. 8 years ago			`import argparse`
Add back utils.py. 7 years ago			`import functools`
update deepspeech to fluid api 5 years ago			`import io`
Rename some folders and update examples. 7 years ago			`from model_utils.model import DeepSpeech2Model`
unify api to 1.6 version and fix some problems 5 years ago			`from model_utils.model_check import check_cuda, check_version`
Refactor whole data preprocessor for DS2 (re-design classes, re-organize dir, add augmentaion interfaces etc.). 1. Refactor data preprocessor with new added class AudioSegment, SpeechSegment, TextFeaturizer, AudioFeaturizer, SpeechFeaturizer. 2. Add data augmentation interfaces and class AugmentorBase, AugmentationPipeline, VolumnPerturbAugmentor etc.. 3. Seperate normalizer's mean and std computing from training, by adding FeatureNormalizer and a seperate tool compute_mean_std.py. 4. Re-organize directory. 8 years ago			`from data_utils.data import DataGenerator`
Re-organize folder structure and hierarchy for DS2. 7 years ago			`from utils.utility import add_arguments, print_arguments`
Add function docs. 8 years ago
update deepspeech to fluid api 5 years ago			`import paddle.fluid as fluid`

Re-style the config codes for tools in DS2. 7 years ago			`parser = argparse.ArgumentParser(description=__doc__)`
Add back utils.py. 7 years ago			`add_arg = functools.partial(add_arguments, argparser=parser)`
			`# yapf: disable`
Reduce the config parsing codes for DS2 and make it looks cleaner. 7 years ago			`add_arg('batch_size', int, 256, "Minibatch size.")`
update deepspeech to fluid api 5 years ago			`add_arg('num_epoch', int, 200, "# of training epochs.")`
Reduce the config parsing codes for DS2 and make it looks cleaner. 7 years ago			`add_arg('num_conv_layers', int, 2, "# of convolution layers.")`
			`add_arg('num_rnn_layers', int, 3, "# of recurrent layers.")`
			`add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.")`
update deepspeech to fluid api 5 years ago			`add_arg('num_iter_print', int, 100, "Every # batch for printing "`
Sort the config lines to make it look better. 7 years ago			`"train cost.")`
update deepspeech to fluid api 5 years ago			`add_arg('save_epoch', int, 10, "# Every # batch for save checkpoint and modle params ")`
			`add_arg('num_samples', int, 10000, "The num of train samples.")`
Sort the config lines to make it look better. 7 years ago			`add_arg('learning_rate', float, 5e-4, "Learning rate.")`
			`add_arg('max_duration', float, 27.0, "Longest audio duration allowed.")`
			`add_arg('min_duration', float, 0.0, "Shortest audio duration allowed.")`
Add profile.sh script for multi-gpu profiling. 7 years ago			`add_arg('test_off', bool, False, "Turn off testing.")`
Sort the config lines to make it look better. 7 years ago			`add_arg('use_sortagrad', bool, True, "Use SortaGrad or not.")`
			`add_arg('use_gpu', bool, True, "Use GPU or not.")`
Update argument naming following Yibing's reviews. 7 years ago			`add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.")`
Re-organize folder structure and hierarchy for DS2. 7 years ago			`add_arg('is_local', bool, True, "Use pserver or not.")`
Reduce the config parsing codes for DS2 and make it looks cleaner. 7 years ago			`add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across "`
update deepspeech to fluid api 5 years ago			`"bi-directional RNNs. Not for GRU.")`
unify api to 1.6 version and fix some problems 5 years ago			`add_arg('init_from_pretrained_model',str,`
update deepspeech to fluid api 5 years ago			`None,`
			`"If None, the training starts from scratch, "`
			`"otherwise, it resumes from the pre-trained model.")`

Reduce the config parsing codes for DS2 and make it looks cleaner. 7 years ago			`add_arg('train_manifest', str,`
Re-organize folder structure and hierarchy for DS2. 7 years ago			`'data/librispeech/manifest.train',`
Reduce the config parsing codes for DS2 and make it looks cleaner. 7 years ago			`"Filepath of train manifest.")`
			`add_arg('dev_manifest', str,`
Re-organize folder structure and hierarchy for DS2. 7 years ago			`'data/librispeech/manifest.dev-clean',`
Reduce the config parsing codes for DS2 and make it looks cleaner. 7 years ago			`"Filepath of validation manifest.")`
			`add_arg('mean_std_path', str,`
Re-organize folder structure and hierarchy for DS2. 7 years ago			`'data/librispeech/mean_std.npz',`
Reduce the config parsing codes for DS2 and make it looks cleaner. 7 years ago			`"Filepath of normalizer's mean & std.")`
			`add_arg('vocab_path', str,`
Rename some folders and update examples. 7 years ago			`'data/librispeech/vocab.txt',`
Reduce the config parsing codes for DS2 and make it looks cleaner. 7 years ago			`"Filepath of vocabulary.")`
			`add_arg('output_model_dir', str,`
Update READMD.md and other details by following reviewers comments. 7 years ago			`"./checkpoints/libri",`
Reduce the config parsing codes for DS2 and make it looks cleaner. 7 years ago			`"Directory for saving checkpoints.")`
Sort the config lines to make it look better. 7 years ago			`add_arg('augment_conf_path',str,`
			`'conf/augmentation.config',`
			`"Filepath of augmentation configuration file (json-format).")`
			`add_arg('specgram_type', str,`
			`'linear',`
			`"Audio feature type. Options: linear, mfcc.",`
			`choices=['linear', 'mfcc'])`
			`add_arg('shuffle_method', str,`
			`'batch_shuffle_clipped',`
			`"Shuffle method.",`
			`choices=['instance_shuffle', 'batch_shuffle', 'batch_shuffle_clipped'])`
Reduce the config parsing codes for DS2 and make it looks cleaner. 7 years ago			`# yapf: disable`
Add back utils.py. 7 years ago			`args = parser.parse_args()`
Add librispeech dataset, audio data provider and simplfied DeepSpeech2 model configuration. Bug exists when run training. 8 years ago

			`def train():`
Add shuffle type of instance_shuffle and batch_shuffle_clipped. 8 years ago			`"""DeepSpeech2 training."""`
unify api to 1.6 version and fix some problems 5 years ago
			`# check if set use_gpu=True in paddlepaddle cpu version`
			`check_cuda(args.use_gpu)`
			`# check if paddlepaddle version is satisfied`
			`check_version()`

update deepspeech to fluid api 5 years ago			`if args.use_gpu:`
			`place = fluid.CUDAPlace(0)`
			`else:`
			`place = fluid.CPUPlace()`

Simplify train.py, evaluate.py, infer.py and tune.py by adding DeepSpeech2Model class. 8 years ago			`train_generator = DataGenerator(`
Reduce the config parsing codes for DS2 and make it looks cleaner. 7 years ago			`vocab_filepath=args.vocab_path,`
			`mean_std_filepath=args.mean_std_path,`
update deepspeech to fluid api 5 years ago			`augmentation_config=io.open(args.augment_conf_path, mode='r', encoding='utf8').read(),`
Simplify train.py, evaluate.py, infer.py and tune.py by adding DeepSpeech2Model class. 8 years ago			`max_duration=args.max_duration,`
			`min_duration=args.min_duration,`
			`specgram_type=args.specgram_type,`
update deepspeech to fluid api 5 years ago			`place=place)`
Simplify train.py, evaluate.py, infer.py and tune.py by adding DeepSpeech2Model class. 8 years ago			`dev_generator = DataGenerator(`
Reduce the config parsing codes for DS2 and make it looks cleaner. 7 years ago			`vocab_filepath=args.vocab_path,`
			`mean_std_filepath=args.mean_std_path,`
Simplify train.py, evaluate.py, infer.py and tune.py by adding DeepSpeech2Model class. 8 years ago			`augmentation_config="{}",`
			`specgram_type=args.specgram_type,`
update deepspeech to fluid api 5 years ago			`place = place)`
Support variable input batch and sortagrad. 8 years ago			`train_batch_reader = train_generator.batch_reader_creator(`
Reduce the config parsing codes for DS2 and make it looks cleaner. 7 years ago			`manifest_path=args.train_manifest,`
Refactor decoder interfaces and add ./data directory. 8 years ago			`batch_size=args.batch_size,`
unify api to 1.6 version and fix some problems 5 years ago			`sortagrad=args.use_sortagrad if args.init_from_pretrained_model is None else False,`
Add shuffle type of instance_shuffle and batch_shuffle_clipped. 8 years ago			`shuffle_method=args.shuffle_method)`
Simplify train.py, evaluate.py, infer.py and tune.py by adding DeepSpeech2Model class. 8 years ago			`dev_batch_reader = dev_generator.batch_reader_creator(`
Reduce the config parsing codes for DS2 and make it looks cleaner. 7 years ago			`manifest_path=args.dev_manifest,`
Refactor decoder interfaces and add ./data directory. 8 years ago			`batch_size=args.batch_size,`
Refactor whole data preprocessor for DS2 (re-design classes, re-organize dir, add augmentaion interfaces etc.). 1. Refactor data preprocessor with new added class AudioSegment, SpeechSegment, TextFeaturizer, AudioFeaturizer, SpeechFeaturizer. 2. Add data augmentation interfaces and class AugmentorBase, AugmentationPipeline, VolumnPerturbAugmentor etc.. 3. Seperate normalizer's mean and std computing from training, by adding FeatureNormalizer and a seperate tool compute_mean_std.py. 4. Re-organize directory. 8 years ago			`sortagrad=False,`
Add shuffle type of instance_shuffle and batch_shuffle_clipped. 8 years ago			`shuffle_method=None)`
1. Fix incorrect decoder result printing. 2. Fix incorrect batch-norm usage in RNN. 3. Fix overlapping train/dev/test manfests. 4. Update README.md and requirements.txt. 5. Expose more arguments to users in argparser. 6. Update all other details. 8 years ago
Simplify train.py, evaluate.py, infer.py and tune.py by adding DeepSpeech2Model class. 8 years ago			`ds2_model = DeepSpeech2Model(`
			`vocab_size=train_generator.vocab_size,`
			`num_conv_layers=args.num_conv_layers,`
			`num_rnn_layers=args.num_rnn_layers,`
			`rnn_layer_size=args.rnn_layer_size,`
Add GRU support. 8 years ago			`use_gru=args.use_gru,`
update deepspeech to fluid api 5 years ago			`share_rnn_weights=args.share_rnn_weights,`
			`place=place,`
unify api to 1.6 version and fix some problems 5 years ago			`init_from_pretrained_model=args.init_from_pretrained_model,`
update deepspeech to fluid api 5 years ago			`output_model_dir=args.output_model_dir)`

Simplify train.py, evaluate.py, infer.py and tune.py by adding DeepSpeech2Model class. 8 years ago			`ds2_model.train(`
			`train_batch_reader=train_batch_reader,`
			`dev_batch_reader=dev_batch_reader,`
			`feeding_dict=train_generator.feeding,`
Reduce the config parsing codes for DS2 and make it looks cleaner. 7 years ago			`learning_rate=args.learning_rate,`
Simplify train.py, evaluate.py, infer.py and tune.py by adding DeepSpeech2Model class. 8 years ago			`gradient_clipping=400,`
update deepspeech to fluid api 5 years ago			`batch_size=args.batch_size,`
			`num_samples=args.num_samples,`
			`num_epoch=args.num_epoch,`
			`save_epoch=args.save_epoch,`
Reduce the config parsing codes for DS2 and make it looks cleaner. 7 years ago			`num_iterations_print=args.num_iter_print,`
Add profile.sh script for multi-gpu profiling. 7 years ago			`test_off=args.test_off)`
Add librispeech dataset, audio data provider and simplfied DeepSpeech2 model configuration. Bug exists when run training. 8 years ago

			`def main():`
Reduce the config parsing codes for DS2 and make it looks cleaner. 7 years ago			`print_arguments(args)`
Add librispeech dataset, audio data provider and simplfied DeepSpeech2 model configuration. Bug exists when run training. 8 years ago			`train()`


			`if __name__ == '__main__':`
			`main()`