diff --git a/demo_server.py b/demo_server.py
index d2afa49b..5eed3d2e 100644
--- a/demo_server.py
+++ b/demo_server.py
@@ -27,41 +27,25 @@ def add_arg(argname, type, default, help, **kwargs):
 # yapf: disable
-# configurations of overall
 add_arg('host_port', int, 8086, "Server's IP port.")
-add_arg('host_ip', str,
-        'localhost',
-        "Server's IP address.")
-add_arg('speech_save_dir', str,
-        'demo_cache',
-        "Directory to save demo audios.")
-add_arg('use_gpu', bool, True, "Use GPU or not.")
-# configurations of decoder
 add_arg('beam_size', int, 500, "Beam search width.")
-add_arg('alpha', float, 0.36, "Coef of LM for beam search.")
-add_arg('beta', float, 0.25, "Coef of WC for beam search.")
-add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.")
-add_arg('lang_model_path', str,
-        'lm/data/common_crawl_00.prune01111.trie.klm',
-        "Filepath for language model.")
-add_arg('decoder_method', str,
-        'ctc_beam_search',
-        "Decoder method. Options: ctc_beam_search, ctc_greedy",
-        choices = ['ctc_beam_search', 'ctc_greedy'])
-# configurations of data preprocess
-add_arg('specgram_type', str,
-        'linear',
-        "Audio feature type. Options: linear, mfcc.",
-        choices=['linear', 'mfcc'])
-# configurations of model structure
 add_arg('num_conv_layers', int, 2, "# of convolution layers.")
 add_arg('num_rnn_layers', int, 3, "# of recurrent layers.")
 add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.")
+add_arg('alpha', float, 0.36, "Coef of LM for beam search.")
+add_arg('beta', float, 0.25, "Coef of WC for beam search.")
+add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.")
 add_arg('use_gru', bool, False, "Use GRUs instead of Simple RNNs.")
+add_arg('use_gpu', bool, True, "Use GPU or not.")
 add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across "
         "bi-directional RNNs. Not for GRU.")
-# configurations of data io
-add_arg('warmup_manifest', str,
+add_arg('host_ip', str,
+        'localhost',
+        "Server's IP address.")
+add_arg('speech_save_dir', str,
+        'demo_cache',
+        "Directory to save demo audios.")
+add_arg('warmup_manifest', str,
         'datasets/manifest.test',
         "Filepath of manifest to warm up.")
 add_arg('mean_std_path', str,
@@ -70,11 +54,21 @@ add_arg('mean_std_path', str,
 add_arg('vocab_path', str,
         'datasets/vocab/eng_vocab.txt',
         "Filepath of vocabulary.")
-# configurations of model io
 add_arg('model_path', str,
         './checkpoints/params.latest.tar.gz',
         "If None, the training starts from scratch, "
         "otherwise, it resumes from the pre-trained model.")
+add_arg('lang_model_path', str,
+        'lm/data/common_crawl_00.prune01111.trie.klm',
+        "Filepath for language model.")
+add_arg('decoder_method', str,
+        'ctc_beam_search',
+        "Decoder method. Options: ctc_beam_search, ctc_greedy",
+        choices = ['ctc_beam_search', 'ctc_greedy'])
+add_arg('specgram_type', str,
+        'linear',
+        "Audio feature type. Options: linear, mfcc.",
+        choices=['linear', 'mfcc'])
 args = parser.parse_args()
 # yapf: disable
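Every hunk in this patch funnels through the same thin wrapper whose signature shows up in the hunk headers, def add_arg(argname, type, default, help, **kwargs). A minimal sketch of such a wrapper, assuming it forwards to a module-level argparse parser and coerces bool through distutils.util.strtobool (the wrapper's actual body is not shown in this diff and may differ):

    import argparse
    import distutils.util

    parser = argparse.ArgumentParser(description=__doc__)

    def add_arg(argname, type, default, help, **kwargs):
        # Plain bool() would turn any non-empty string (even "False") into
        # True, so bool-typed flags are coerced through strtobool instead.
        type = distutils.util.strtobool if type == bool else type
        parser.add_argument(
            "--" + argname,
            default=default,
            type=type,
            help=help,
            **kwargs)

Under this reading the patch is pure reordering: within each file every add_arg call keeps its name, type, default, and help string, so the generated CLI is unchanged; only the registration order (and thus the --help listing) moves, and the old "# configurations of ..." grouping comments are dropped rather than relocated.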
diff --git a/evaluate.py b/evaluate.py
index 1adf4255..2c412778 100644
--- a/evaluate.py
+++ b/evaluate.py
@@ -26,39 +26,21 @@ def add_arg(argname, type, default, help, **kwargs):
 # yapf: disable
-# configurations of overall
 add_arg('batch_size', int, 128, "Minibatch size.")
 add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).")
-add_arg('use_gpu', bool, True, "Use GPU or not.")
-add_arg('error_rate_type', str, 'wer', "Error rate type for evaluation.",
-        choices=['wer', 'cer'])
-# configurations of decoder
 add_arg('beam_size', int, 500, "Beam search width.")
-add_arg('alpha', float, 0.36, "Coef of LM for beam search.")
-add_arg('beta', float, 0.25, "Coef of WC for beam search.")
-add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.")
 add_arg('parallels_bsearch',int, NUM_CPU,"# of CPUs for beam search.")
-add_arg('lang_model_path', str,
-        'lm/data/common_crawl_00.prune01111.trie.klm',
-        "Filepath for language model.")
-add_arg('decoder_method', str,
-        'ctc_beam_search',
-        "Decoder method. Options: ctc_beam_search, ctc_greedy",
-        choices = ['ctc_beam_search', 'ctc_greedy'])
-# configurations of data preprocess
 add_arg('parallels_data', int, NUM_CPU,"# of CPUs for data preprocessing.")
-add_arg('specgram_type', str,
-        'linear',
-        "Audio feature type. Options: linear, mfcc.",
-        choices=['linear', 'mfcc'])
-# configurations of model structure
 add_arg('num_conv_layers', int, 2, "# of convolution layers.")
 add_arg('num_rnn_layers', int, 3, "# of recurrent layers.")
 add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.")
+add_arg('alpha', float, 0.36, "Coef of LM for beam search.")
+add_arg('beta', float, 0.25, "Coef of WC for beam search.")
+add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.")
 add_arg('use_gru', bool, False, "Use GRUs instead of Simple RNNs.")
+add_arg('use_gpu', bool, True, "Use GPU or not.")
 add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across "
         "bi-directional RNNs. Not for GRU.")
-# configurations of data io
 add_arg('test_manifest', str,
         'datasets/manifest.test',
         "Filepath of manifest to evaluate.")
@@ -68,11 +50,25 @@ add_arg('mean_std_path', str,
 add_arg('vocab_path', str,
         'datasets/vocab/eng_vocab.txt',
         "Filepath of vocabulary.")
-# configurations of model io
 add_arg('model_path', str,
         './checkpoints/params.latest.tar.gz',
         "If None, the training starts from scratch, "
         "otherwise, it resumes from the pre-trained model.")
+add_arg('lang_model_path', str,
+        'lm/data/common_crawl_00.prune01111.trie.klm',
+        "Filepath for language model.")
+add_arg('decoder_method', str,
+        'ctc_beam_search',
+        "Decoder method. Options: ctc_beam_search, ctc_greedy",
+        choices = ['ctc_beam_search', 'ctc_greedy'])
+add_arg('error_rate_type', str,
+        'wer',
+        "Error rate type for evaluation.",
+        choices=['wer', 'cer'])
+add_arg('specgram_type', str,
+        'linear',
+        "Audio feature type. Options: linear, mfcc.",
+        choices=['linear', 'mfcc'])
 args = parser.parse_args()
 # yapf: disable
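evaluate.py and infer.py default their parallels_* flags to a NUM_CPU constant that these hunks reference but never define. A plausible definition, assuming it is derived from the machine's core count at import time (the exact expression in the repo may differ):

    import multiprocessing

    # Assumed heuristic: use half the cores, leaving headroom for trainers.
    NUM_CPU = multiprocessing.cpu_count() // 2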
diff --git a/infer.py b/infer.py
index cf02808c..313f80c0 100644
--- a/infer.py
+++ b/infer.py
@@ -29,35 +29,18 @@ def add_arg(argname, type, default, help, **kwargs):
 # configurations of overall
 add_arg('num_samples', int, 10, "# of samples to infer.")
 add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).")
-add_arg('use_gpu', bool, True, "Use GPU or not.")
-add_arg('error_rate_type', str, 'wer', "Error rate type for evaluation.",
-        choices=['wer', 'cer'])
-# configurations of decoder
 add_arg('beam_size', int, 500, "Beam search width.")
-add_arg('alpha', float, 0.36, "Coef of LM for beam search.")
-add_arg('beta', float, 0.25, "Coef of WC for beam search.")
-add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.")
 add_arg('parallels_bsearch',int, NUM_CPU,"# of CPUs for beam search.")
-add_arg('lang_model_path', str,
-        'lm/data/common_crawl_00.prune01111.trie.klm',
-        "Filepath for language model.")
-add_arg('decoder_method', str,
-        'ctc_beam_search',
-        "Decoder method. Options: ctc_beam_search, ctc_greedy",
-        choices = ['ctc_beam_search', 'ctc_greedy'])
-# configurations of data preprocess
-add_arg('specgram_type', str,
-        'linear',
-        "Audio feature type. Options: linear, mfcc.",
-        choices=['linear', 'mfcc'])
-# configurations of model structure
 add_arg('num_conv_layers', int, 2, "# of convolution layers.")
 add_arg('num_rnn_layers', int, 3, "# of recurrent layers.")
 add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.")
+add_arg('alpha', float, 0.36, "Coef of LM for beam search.")
+add_arg('beta', float, 0.25, "Coef of WC for beam search.")
+add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.")
 add_arg('use_gru', bool, False, "Use GRUs instead of Simple RNNs.")
+add_arg('use_gpu', bool, True, "Use GPU or not.")
 add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across "
         "bi-directional RNNs. Not for GRU.")
-# configurations of data io
 add_arg('infer_manifest', str,
         'datasets/manifest.dev',
         "Filepath of manifest to infer.")
@@ -67,11 +50,25 @@ add_arg('mean_std_path', str,
 add_arg('vocab_path', str,
         'datasets/vocab/eng_vocab.txt',
         "Filepath of vocabulary.")
-# configurations of model io
+add_arg('lang_model_path', str,
+        'lm/data/common_crawl_00.prune01111.trie.klm',
+        "Filepath for language model.")
 add_arg('model_path', str,
         './checkpoints/params.latest.tar.gz',
         "If None, the training starts from scratch, "
         "otherwise, it resumes from the pre-trained model.")
+add_arg('decoder_method', str,
+        'ctc_beam_search',
+        "Decoder method. Options: ctc_beam_search, ctc_greedy",
+        choices = ['ctc_beam_search', 'ctc_greedy'])
+add_arg('error_rate_type', str,
+        'wer',
+        "Error rate type for evaluation.",
+        choices=['wer', 'cer'])
+add_arg('specgram_type', str,
+        'linear',
+        "Audio feature type. Options: linear, mfcc.",
+        choices=['linear', 'mfcc'])
 args = parser.parse_args()
 # yapf: disable
diff --git a/train.py b/train.py
index d21e6a3b..3d658d27 100644
--- a/train.py
+++ b/train.py
@@ -25,39 +25,24 @@ def add_arg(argname, type, default, help, **kwargs):
 # yapf: disable
-# configurations of optimization
 add_arg('batch_size', int, 256, "Minibatch size.")
-add_arg('learning_rate', float, 5e-4, "Learning rate.")
-add_arg('use_sortagrad', bool, True, "Use SortaGrad or not.")
 add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).")
-add_arg('use_gpu', bool, True, "Use GPU or not.")
 add_arg('num_passes', int, 200, "# of training epochs.")
-add_arg('is_local', bool, True, "Use pserver or not.")
-add_arg('num_iter_print', int, 100, "Every # iterations for printing "
-        "train cost.")
-# configurations of data preprocess
-add_arg('max_duration', float, 27.0, "Longest audio duration allowed.")
-add_arg('min_duration', float, 0.0, "Shortest audio duration allowed.")
 add_arg('parallels_data', int, NUM_CPU,"# of CPUs for data preprocessing.")
-add_arg('specgram_type', str,
-        'linear',
-        "Audio feature type. Options: linear, mfcc.",
-        choices=['linear', 'mfcc'])
-add_arg('augment_conf_path',str,
-        'conf/augmentation.config',
-        "Filepath of augmentation configuration file (json-format).")
-add_arg('shuffle_method', str,
-        'batch_shuffle_clipped',
-        "Shuffle method.",
-        choices=['instance_shuffle', 'batch_shuffle', 'batch_shuffle_clipped'])
-# configurations of model structure
 add_arg('num_conv_layers', int, 2, "# of convolution layers.")
 add_arg('num_rnn_layers', int, 3, "# of recurrent layers.")
 add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.")
+add_arg('num_iter_print', int, 100, "Every # iterations for printing "
+        "train cost.")
+add_arg('learning_rate', float, 5e-4, "Learning rate.")
+add_arg('max_duration', float, 27.0, "Longest audio duration allowed.")
+add_arg('min_duration', float, 0.0, "Shortest audio duration allowed.")
+add_arg('use_sortagrad', bool, True, "Use SortaGrad or not.")
+add_arg('use_gpu', bool, True, "Use GPU or not.")
+add_arg('is_local', bool, True, "Use pserver or not.")
 add_arg('use_gru', bool, False, "Use GRUs instead of Simple RNNs.")
 add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across "
         "bi-directional RNNs. Not for GRU.")
-# configurations of data io
 add_arg('train_manifest', str,
         'datasets/manifest.train',
         "Filepath of train manifest.")
@@ -70,7 +55,6 @@ add_arg('mean_std_path', str,
 add_arg('vocab_path', str,
         'datasets/vocab/eng_vocab.txt',
         "Filepath of vocabulary.")
-# configurations of model io
 add_arg('init_model_path', str,
         None,
         "If None, the training starts from scratch, "
@@ -78,6 +62,17 @@ add_arg('init_model_path', str,
 add_arg('output_model_dir', str,
         "./checkpoints",
         "Directory for saving checkpoints.")
+add_arg('augment_conf_path',str,
+        'conf/augmentation.config',
+        "Filepath of augmentation configuration file (json-format).")
+add_arg('specgram_type', str,
+        'linear',
+        "Audio feature type. Options: linear, mfcc.",
+        choices=['linear', 'mfcc'])
+add_arg('shuffle_method', str,
+        'batch_shuffle_clipped',
+        "Shuffle method.",
+        choices=['instance_shuffle', 'batch_shuffle', 'batch_shuffle_clipped'])
 args = parser.parse_args()
 # yapf: disable
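train.py's relocated augment_conf_path flag describes its target only as "json-format". Purely as an illustration of what such a file could contain, assuming a list of augmentor specs each carrying a type, its parameters, and an application probability (the schema and field names here are hypothetical, not confirmed by this diff):

    import json

    # Hypothetical augmentation config: one time-shift augmentor applied
    # to every utterance. Field names are assumptions for illustration.
    example_conf = [
        {
            "type": "shift",
            "params": {"min_shift_ms": -5, "max_shift_ms": 5},
            "prob": 1.0
        }
    ]

    with open('conf/augmentation.config', 'w') as f:
        json.dump(example_conf, f, indent=4)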
diff --git a/tune.py b/tune.py
index eac7ccd3..2fbe0b98 100644
--- a/tune.py
+++ b/tune.py
@@ -27,40 +27,25 @@ def add_arg(argname, type, default, help, **kwargs):
 # yapf: disable
-# configurations of overall
 add_arg('num_samples', int, 100, "# of samples to infer.")
 add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).")
-add_arg('use_gpu', bool, True, "Use GPU or not.")
-add_arg('error_rate_type', str, 'wer', "Error rate type for evaluation.",
-        choices=['wer', 'cer'])
-# configurations of tuning parameters
-add_arg('alpha_from', float, 0.1, "Where alpha starts tuning from.")
-add_arg('alpha_to', float, 0.36, "Where alpha ends tuning with.")
-add_arg('num_alphas', int, 14, "# of alpha candidates for tuning.")
-add_arg('beta_from', float, 0.05, "Where beta starts tuning from.")
-add_arg('beta_to', float, 0.36, "Where beta ends tuning with.")
-add_arg('num_betas', int, 20, "# of beta candidates for tuning.")
-# configurations of decoder
 add_arg('beam_size', int, 500, "Beam search width.")
-add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.")
 add_arg('parallels_bsearch',int, NUM_CPU,"# of CPUs for beam search.")
-add_arg('lang_model_path', str,
-        'lm/data/common_crawl_00.prune01111.trie.klm',
-        "Filepath for language model.")
-# configurations of data preprocess
-add_arg('specgram_type', str,
-        'linear',
-        "Audio feature type. Options: linear, mfcc.",
-        choices=['linear', 'mfcc'])
-# configurations of model structure
 add_arg('num_conv_layers', int, 2, "# of convolution layers.")
 add_arg('num_rnn_layers', int, 3, "# of recurrent layers.")
 add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.")
+add_arg('num_alphas', int, 14, "# of alpha candidates for tuning.")
+add_arg('num_betas', int, 20, "# of beta candidates for tuning.")
+add_arg('alpha_from', float, 0.1, "Where alpha starts tuning from.")
+add_arg('alpha_to', float, 0.36, "Where alpha ends tuning with.")
+add_arg('beta_from', float, 0.05, "Where beta starts tuning from.")
+add_arg('beta_to', float, 0.36, "Where beta ends tuning with.")
+add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.")
 add_arg('use_gru', bool, False, "Use GRUs instead of Simple RNNs.")
+add_arg('use_gpu', bool, True, "Use GPU or not.")
 add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across "
         "bi-directional RNNs. Not for GRU.")
-# configurations of data io
-add_arg('tune_manifest', str,
+add_arg('tune_manifest', str,
         'datasets/manifest.test',
         "Filepath of manifest to tune.")
 add_arg('mean_std_path', str,
@@ -69,11 +54,21 @@ add_arg('mean_std_path', str,
 add_arg('vocab_path', str,
         'datasets/vocab/eng_vocab.txt',
         "Filepath of vocabulary.")
-# configurations of model io
+add_arg('lang_model_path', str,
+        'lm/data/common_crawl_00.prune01111.trie.klm',
+        "Filepath for language model.")
 add_arg('model_path', str,
         './checkpoints/params.latest.tar.gz',
         "If None, the training starts from scratch, "
         "otherwise, it resumes from the pre-trained model.")
+add_arg('error_rate_type', str,
+        'wer',
+        "Error rate type for evaluation.",
+        choices=['wer', 'cer'])
+add_arg('specgram_type', str,
+        'linear',
+        "Audio feature type. Options: linear, mfcc.",
+        choices=['linear', 'mfcc'])
 args = parser.parse_args()
 # yapf: disable
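One behavioral note on the bool flags gathered together above (use_gpu, use_gru, is_local, use_sortagrad, share_rnn_weights): if add_arg coerces bool through strtobool as sketched earlier, these are value-taking options rather than bare switches. A self-contained check of that assumption:

    import argparse
    from distutils.util import strtobool

    parser = argparse.ArgumentParser()
    # --use_gpu registered the way the add_arg sketch above would do it.
    parser.add_argument('--use_gpu', type=strtobool, default=True,
                        help="Use GPU or not.")

    args = parser.parse_args(['--use_gpu', 'False'])
    print(bool(args.use_gpu))  # False: strtobool('False') returns 0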