Reduce the config parsing code for DS2 and make it look cleaner.

pull/2/head
Xinghai Sun 7 years ago
parent 2aa4af1c29
commit 805846ce67

@@ -9,8 +9,9 @@ from math import log
import multiprocessing


-def ctc_best_path_decoder(probs_seq, vocabulary):
-    """Best path decoder, also called argmax decoder or greedy decoder.
+def ctc_greedy_decoder(probs_seq, vocabulary):
+    """CTC greedy (best path) decoder.
+
    Path consisting of the most probable tokens are further post-processed to
    remove consecutive repetitions and all blanks.
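For readers scanning this rename, a minimal standalone sketch of what a CTC greedy (best path) decoder does is shown below. It is illustrative only, not the repository's implementation, and it assumes the blank token sits at the extra index just past the vocabulary.

import itertools

import numpy as np


def ctc_greedy_decoder_sketch(probs_seq, vocabulary):
    """Argmax per frame, collapse repeats, then drop blanks."""
    # Most probable token index at every time step.
    best_path = [int(np.argmax(step_probs)) for step_probs in probs_seq]
    # Merge consecutive repetitions, e.g. [3, 3, 0, 5, 5] -> [3, 0, 5].
    merged = [index for index, _ in itertools.groupby(best_path)]
    # Remove the blank token (assumed to be the index right after the vocabulary).
    blank_index = len(vocabulary)
    return ''.join(vocabulary[index] for index in merged if index != blank_index)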
@@ -45,10 +46,12 @@ def ctc_beam_search_decoder(probs_seq,
                            cutoff_prob=1.0,
                            ext_scoring_func=None,
                            nproc=False):
-    """Beam search decoder for CTC-trained network. It utilizes beam search
-    to approximately select top best decoding labels and returning results
-    in the descending order. The implementation is based on Prefix
-    Beam Search (https://arxiv.org/abs/1408.2873), and the unclear part is
+    """CTC Beam search decoder.
+
+    It utilizes beam search to approximately select top best decoding
+    labels and returning results in the descending order.
+    The implementation is based on Prefix Beam Search
+    (https://arxiv.org/abs/1408.2873), and the unclear part is
    redesigned. Two important modifications: 1) in the iterative computation
    of probabilities, the assignment operation is changed to accumulation for
    one prefix may comes from different paths; 2) the if condition "if l^+ not
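The first modification mentioned in the docstring (accumulation instead of assignment) is easy to miss. The toy snippet below, which is not the decoder's actual loop, shows why the probabilities of different alignment paths that collapse to the same prefix must be summed rather than overwritten.

from collections import defaultdict

# Two distinct alignment paths (e.g. "ca-t" and "c-at", with '-' the blank)
# collapse to the same prefix "cat"; their probability mass must be added.
path_probs = [("cat", 0.25), ("cat", 0.125)]

prefix_probs = defaultdict(float)
for prefix, prob in path_probs:
    prefix_probs[prefix] += prob  # accumulate; plain assignment would drop 0.25

print(prefix_probs["cat"])  # 0.375, the total probability of the prefix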

@@ -9,118 +9,74 @@ import SocketServer
import struct
import wave
import paddle.v2 as paddle
from data_utils.data import DataGenerator
from model import DeepSpeech2Model
from data_utils.utils import read_manifest

parser = argparse.ArgumentParser(description=__doc__)


def add_arg(argname, type, default, help, **kwargs):
    type = distutils.util.strtobool if type == bool else type
    parser.add_argument(
        "--" + argname,
        default=default,
        type=type,
        help=help + ' Default: %(default)s.',
        **kwargs)


# yapf: disable
# configurations of overall
add_arg('host_port',       int,   8086,  "Server's IP port.")
add_arg('host_ip',         str,   'localhost', "Server's IP address.")
add_arg('speech_save_dir', str,   'demo_cache',
        "Directory to save demo audios.")
add_arg('use_gpu',         bool,  True,  "Use GPU or not.")
# configurations of decoder
add_arg('beam_size',       int,   500,   "Beam search width.")
add_arg('alpha',           float, 0.36,  "Coef of LM for beam search.")
add_arg('beta',            float, 0.25,  "Coef of WC for beam search.")
add_arg('cutoff_prob',     float, 0.99,  "Cutoff probability for pruning.")
add_arg('lang_model_path', str,
        'lm/data/common_crawl_00.prune01111.trie.klm',
        "Filepath for language model.")
add_arg('decoder_method',  str,   'ctc_beam_search',
        "Decoder method. Options: ctc_beam_search, ctc_greedy",
        choices=['ctc_beam_search', 'ctc_greedy'])
# configurations of data preprocess
add_arg('specgram_type',   str,   'linear',
        "Audio feature type. Options: linear, mfcc.",
        choices=['linear', 'mfcc'])
# configurations of model structure
add_arg('num_conv_layers', int,   2,     "# of convolution layers.")
add_arg('num_rnn_layers',  int,   3,     "# of recurrent layers.")
add_arg('rnn_layer_size',  int,   2048,  "# of recurrent cells per layer.")
add_arg('use_gru',         bool,  False, "Use GRUs instead of Simple RNNs.")
add_arg('share_rnn_weights', bool, True,
        "Share input-hidden weights across bi-directional RNNs. Not for GRU.")
# configurations of data io
add_arg('warmup_manifest', str,   'datasets/manifest.test',
        "Filepath of manifest to warm up.")
add_arg('mean_std_path',   str,   'mean_std.npz',
        "Filepath of normalizer's mean & std.")
add_arg('vocab_path',      str,   'datasets/vocab/eng_vocab.txt',
        "Filepath of vocabulary.")
# configurations of model io
add_arg('model_path',      str,   './checkpoints/params.latest.tar.gz',
        "If None, the training starts from scratch, "
        "otherwise, it resumes from the pre-trained model.")
args = parser.parse_args()
# yapf: disable
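The add_arg helper above is what shrinks each flag definition to one or two lines. A standalone sketch of the same idea follows (the helper body mirrors the hunk above; the example flags and invocation are hypothetical). Because bool routes through distutils.util.strtobool, boolean flags accept values such as --use_gpu False or --use_gpu 0.

import argparse
import distutils.util

parser = argparse.ArgumentParser(description=__doc__)


def add_arg(argname, type, default, help, **kwargs):
    # bool("False") would be truthy, so booleans are parsed with strtobool.
    type = distutils.util.strtobool if type == bool else type
    parser.add_argument(
        "--" + argname,
        default=default,
        type=type,
        help=help + ' Default: %(default)s.',
        **kwargs)


# Hypothetical flags in the same style as the configuration block above.
add_arg('use_gpu',   bool, True, "Use GPU or not.")
add_arg('beam_size', int,  500,  "Beam search width.")

# e.g. python some_script.py --use_gpu False --beam_size 300
args = parser.parse_args()
print("use_gpu=%s beam_size=%d" % (args.use_gpu, args.beam_size))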
class AsrTCPServer(SocketServer.TCPServer):

@@ -200,8 +156,8 @@ def start_server():
    """Start the ASR server"""
    # prepare data generator
    data_generator = DataGenerator(
-        vocab_filepath=args.vocab_filepath,
-        mean_std_filepath=args.mean_std_filepath,
+        vocab_filepath=args.vocab_path,
+        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        num_threads=1)

@@ -212,7 +168,7 @@ def start_server():
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
-        pretrained_model_path=args.model_filepath,
+        pretrained_model_path=args.model_path,
        share_rnn_weights=args.share_rnn_weights)

    # prepare ASR inference handler

@@ -220,13 +176,13 @@ def start_server():
        feature = data_generator.process_utterance(filename, "")
        result_transcript = ds2_model.infer_batch(
            infer_data=[feature],
-            decode_method=args.decode_method,
+            decoder_method=args.decoder_method,
            beam_alpha=args.alpha,
            beam_beta=args.beta,
            beam_size=args.beam_size,
            cutoff_prob=args.cutoff_prob,
            vocab_list=data_generator.vocab_list,
-            language_model_path=args.language_model_path,
+            language_model_path=args.lang_model_path,
            num_processes=1)
        return result_transcript[0]

@@ -235,7 +191,7 @@ def start_server():
    print('Warming up ...')
    warm_up_test(
        audio_process_handler=file_to_transcript,
-        manifest_path=args.warmup_manifest_path,
+        manifest_path=args.warmup_manifest,
        num_test_cases=3)
    print('-----------------------------------------------------------')

@@ -249,6 +205,13 @@ def start_server():
    server.serve_forever()


+def print_arguments(args):
+    print("----------- Configuration Arguments -----------")
+    for arg, value in sorted(vars(args).iteritems()):
+        print("%s: %s" % (arg, value))
+    print("------------------------------------------------")
+
+
def main():
    print_arguments(args)
    paddle.init(use_gpu=args.use_gpu, trainer_count=1)
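Each entry script now carries its own copy of print_arguments (the shared utils module is deleted at the end of this commit). A small hedged usage example, with made-up values:

import argparse


def print_arguments(args):
    print("----------- Configuration Arguments -----------")
    for arg, value in sorted(vars(args).iteritems()):
        print("%s: %s" % (arg, value))
    print("------------------------------------------------")


# Renders one "name: value" line per argument, sorted by name, between rulers.
print_arguments(argparse.Namespace(beam_size=500, use_gpu=True))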

@@ -10,140 +10,83 @@ import paddle.v2 as paddle
from data_utils.data import DataGenerator
from model import DeepSpeech2Model
from error_rate import wer, cer

NUM_CPU = multiprocessing.cpu_count() // 2

parser = argparse.ArgumentParser(description=__doc__)


def add_arg(argname, type, default, help, **kwargs):
    type = distutils.util.strtobool if type == bool else type
    parser.add_argument(
        "--" + argname,
        default=default,
        type=type,
        help=help + ' Default: %(default)s.',
        **kwargs)


# yapf: disable
# configurations of overall
add_arg('batch_size',      int,   128,   "Minibatch size.")
add_arg('trainer_count',   int,   8,     "# of Trainers (CPUs or GPUs).")
add_arg('use_gpu',         bool,  True,  "Use GPU or not.")
add_arg('error_rate_type', str,   'wer',
        "Error rate type for evaluation.", choices=['wer', 'cer'])
# configurations of decoder
add_arg('beam_size',       int,   500,   "Beam search width.")
add_arg('alpha',           float, 0.36,  "Coef of LM for beam search.")
add_arg('beta',            float, 0.25,  "Coef of WC for beam search.")
add_arg('cutoff_prob',     float, 0.99,  "Cutoff probability for pruning.")
add_arg('parallels_bsearch', int, NUM_CPU, "# of CPUs for beam search.")
add_arg('lang_model_path', str,
        'lm/data/common_crawl_00.prune01111.trie.klm',
        "Filepath for language model.")
add_arg('decoder_method',  str,   'ctc_beam_search',
        "Decoder method. Options: ctc_beam_search, ctc_greedy",
        choices=['ctc_beam_search', 'ctc_greedy'])
# configurations of data preprocess
add_arg('parallels_data',  int,   NUM_CPU, "# of CPUs for data preprocessing.")
add_arg('specgram_type',   str,   'linear',
        "Audio feature type. Options: linear, mfcc.",
        choices=['linear', 'mfcc'])
# configurations of model structure
add_arg('num_conv_layers', int,   2,     "# of convolution layers.")
add_arg('num_rnn_layers',  int,   3,     "# of recurrent layers.")
add_arg('rnn_layer_size',  int,   2048,  "# of recurrent cells per layer.")
add_arg('use_gru',         bool,  False, "Use GRUs instead of Simple RNNs.")
add_arg('share_rnn_weights', bool, True,
        "Share input-hidden weights across bi-directional RNNs. Not for GRU.")
# configurations of data io
add_arg('test_manifest',   str,   'datasets/manifest.test',
        "Filepath of manifest to evaluate.")
add_arg('mean_std_path',   str,   'mean_std.npz',
        "Filepath of normalizer's mean & std.")
add_arg('vocab_path',      str,   'datasets/vocab/eng_vocab.txt',
        "Filepath of vocabulary.")
# configurations of model io
add_arg('model_path',      str,   './checkpoints/params.latest.tar.gz',
        "If None, the training starts from scratch, "
        "otherwise, it resumes from the pre-trained model.")
args = parser.parse_args()
# yapf: disable


def evaluate():
    """Evaluate on whole test data for DeepSpeech2."""
    data_generator = DataGenerator(
-        vocab_filepath=args.vocab_filepath,
-        mean_std_filepath=args.mean_std_filepath,
+        vocab_filepath=args.vocab_path,
+        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
-        num_threads=args.num_threads_data)
+        num_threads=args.parallels_data)
    batch_reader = data_generator.batch_reader_creator(
-        manifest_path=args.decode_manifest_path,
+        manifest_path=args.test_manifest,
        batch_size=args.batch_size,
        min_batch_size=1,
        sortagrad=False,

@@ -155,7 +98,7 @@ def evaluate():
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
-        pretrained_model_path=args.model_filepath,
+        pretrained_model_path=args.model_path,
        share_rnn_weights=args.share_rnn_weights)

    error_rate_func = cer if args.error_rate_type == 'cer' else wer

@@ -163,14 +106,14 @@ def evaluate():
    for infer_data in batch_reader():
        result_transcripts = ds2_model.infer_batch(
            infer_data=infer_data,
-            decode_method=args.decode_method,
+            decoder_method=args.decoder_method,
            beam_alpha=args.alpha,
            beam_beta=args.beta,
            beam_size=args.beam_size,
            cutoff_prob=args.cutoff_prob,
            vocab_list=data_generator.vocab_list,
-            language_model_path=args.language_model_path,
+            language_model_path=args.lang_model_path,
-            num_processes=args.num_processes_beam_search)
+            num_processes=args.parallels_bsearch)
        target_transcripts = [
            ''.join([data_generator.vocab_list[token] for token in transcript])
            for _, transcript in infer_data

@@ -184,8 +127,15 @@ def evaluate():
          (args.error_rate_type, num_ins, num_ins, error_sum / num_ins))


+def print_arguments(args):
+    print("----------- Configuration Arguments -----------")
+    for arg, value in sorted(vars(args).iteritems()):
+        print("%s: %s" % (arg, value))
+    print("------------------------------------------------")
+
+
def main():
-    utils.print_arguments(args)
+    print_arguments(args)
    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
    evaluate()

@@ -10,140 +10,82 @@ import paddle.v2 as paddle
from data_utils.data import DataGenerator
from model import DeepSpeech2Model
from error_rate import wer, cer

NUM_CPU = multiprocessing.cpu_count() // 2

parser = argparse.ArgumentParser(description=__doc__)


def add_arg(argname, type, default, help, **kwargs):
    type = distutils.util.strtobool if type == bool else type
    parser.add_argument(
        "--" + argname,
        default=default,
        type=type,
        help=help + ' Default: %(default)s.',
        **kwargs)


# yapf: disable
# configurations of overall
add_arg('num_samples',     int,   10,    "# of samples to infer.")
add_arg('trainer_count',   int,   8,     "# of Trainers (CPUs or GPUs).")
add_arg('use_gpu',         bool,  True,  "Use GPU or not.")
add_arg('error_rate_type', str,   'wer',
        "Error rate type for evaluation.", choices=['wer', 'cer'])
# configurations of decoder
add_arg('beam_size',       int,   500,   "Beam search width.")
add_arg('alpha',           float, 0.36,  "Coef of LM for beam search.")
add_arg('beta',            float, 0.25,  "Coef of WC for beam search.")
add_arg('cutoff_prob',     float, 0.99,  "Cutoff probability for pruning.")
add_arg('parallels_bsearch', int, NUM_CPU, "# of CPUs for beam search.")
add_arg('lang_model_path', str,
        'lm/data/common_crawl_00.prune01111.trie.klm',
        "Filepath for language model.")
add_arg('decoder_method',  str,   'ctc_beam_search',
        "Decoder method. Options: ctc_beam_search, ctc_greedy",
        choices=['ctc_beam_search', 'ctc_greedy'])
# configurations of data preprocess
add_arg('specgram_type',   str,   'linear',
        "Audio feature type. Options: linear, mfcc.",
        choices=['linear', 'mfcc'])
# configurations of model structure
add_arg('num_conv_layers', int,   2,     "# of convolution layers.")
add_arg('num_rnn_layers',  int,   3,     "# of recurrent layers.")
add_arg('rnn_layer_size',  int,   2048,  "# of recurrent cells per layer.")
add_arg('use_gru',         bool,  False, "Use GRUs instead of Simple RNNs.")
add_arg('share_rnn_weights', bool, True,
        "Share input-hidden weights across bi-directional RNNs. Not for GRU.")
# configurations of data io
add_arg('infer_manifest',  str,   'datasets/manifest.dev',
        "Filepath of manifest to infer.")
add_arg('mean_std_path',   str,   'mean_std.npz',
        "Filepath of normalizer's mean & std.")
add_arg('vocab_path',      str,   'datasets/vocab/eng_vocab.txt',
        "Filepath of vocabulary.")
# configurations of model io
add_arg('model_path',      str,   './checkpoints/params.latest.tar.gz',
        "If None, the training starts from scratch, "
        "otherwise, it resumes from the pre-trained model.")
args = parser.parse_args()
# yapf: disable


def infer():
    """Inference for DeepSpeech2."""
    data_generator = DataGenerator(
-        vocab_filepath=args.vocab_filepath,
-        mean_std_filepath=args.mean_std_filepath,
+        vocab_filepath=args.vocab_path,
+        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
-        num_threads=args.num_threads_data)
+        num_threads=1)
    batch_reader = data_generator.batch_reader_creator(
-        manifest_path=args.decode_manifest_path,
+        manifest_path=args.infer_manifest,
        batch_size=args.num_samples,
        min_batch_size=1,
        sortagrad=False,

@@ -156,18 +98,18 @@ def infer():
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
-        pretrained_model_path=args.model_filepath,
+        pretrained_model_path=args.model_path,
        share_rnn_weights=args.share_rnn_weights)

    result_transcripts = ds2_model.infer_batch(
        infer_data=infer_data,
-        decode_method=args.decode_method,
+        decoder_method=args.decoder_method,
        beam_alpha=args.alpha,
        beam_beta=args.beta,
        beam_size=args.beam_size,
        cutoff_prob=args.cutoff_prob,
        vocab_list=data_generator.vocab_list,
-        language_model_path=args.language_model_path,
+        language_model_path=args.lang_model_path,
-        num_processes=args.num_processes_beam_search)
+        num_processes=args.parallels_bsearch)

    error_rate_func = cer if args.error_rate_type == 'cer' else wer
    target_transcripts = [

@@ -181,8 +123,15 @@ def infer():
              (args.error_rate_type, error_rate_func(target, result)))


+def print_arguments(args):
+    print("----------- Configuration Arguments -----------")
+    for arg, value in sorted(vars(args).iteritems()):
+        print("%s: %s" % (arg, value))
+    print("------------------------------------------------")
+
+
def main():
-    utils.print_arguments(args)
+    print_arguments(args)
    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
    infer()

@@ -146,7 +146,7 @@ class DeepSpeech2Model(object):
        # run inference
        return self._loss_inferer.infer(input=infer_data)

-    def infer_batch(self, infer_data, decode_method, beam_alpha, beam_beta,
+    def infer_batch(self, infer_data, decoder_method, beam_alpha, beam_beta,
                    beam_size, cutoff_prob, vocab_list, language_model_path,
                    num_processes):
        """Model inference. Infer the transcription for a batch of speech

@@ -156,9 +156,9 @@ class DeepSpeech2Model(object):
                           consisting of a tuple of audio features and
                           transcription text (empty string).
        :type infer_data: list
-        :param decode_method: Decoding method name, 'best_path' or
-                              'beam search'.
-        :param decode_method: string
+        :param decoder_method: Decoding method name, 'ctc_greedy' or
+                               'ctc_beam_search'.
+        :param decoder_method: string
        :param beam_alpha: Parameter associated with language model.
        :type beam_alpha: float
        :param beam_beta: Parameter associated with word count.

@@ -190,13 +190,13 @@ class DeepSpeech2Model(object):
        ]
        # run decoder
        results = []
-        if decode_method == "best_path":
+        if decoder_method == "ctc_greedy":
            # best path decode
            for i, probs in enumerate(probs_split):
-                output_transcription = ctc_best_path_decoder(
+                output_transcription = ctc_greedy_decoder(
                    probs_seq=probs, vocabulary=vocab_list)
                results.append(output_transcription)
-        elif decode_method == "beam_search":
+        elif decoder_method == "ctc_beam_search":
            # initialize external scorer
            if self._ext_scorer == None:
                self._ext_scorer = LmScorer(beam_alpha, beam_beta,

@@ -205,7 +205,6 @@ class DeepSpeech2Model(object):
            else:
                self._ext_scorer.reset_params(beam_alpha, beam_beta)
                assert self._loaded_lm_path == language_model_path
-
            # beam search decode
            beam_search_results = ctc_beam_search_decoder_batch(
                probs_split=probs_split,

@@ -218,8 +217,8 @@ class DeepSpeech2Model(object):
            results = [result[0][1] for result in beam_search_results]
        else:
-            raise ValueError("Decoding method [%s] is not supported." %
-                             decode_method)
+            raise ValueError("Decoder method [%s] is not supported." %
+                             decoder_method)
        return results

    def _create_parameters(self, model_path=None):

@@ -9,169 +9,103 @@ import multiprocessing
import paddle.v2 as paddle
from model import DeepSpeech2Model
from data_utils.data import DataGenerator

NUM_CPU = multiprocessing.cpu_count() // 2

parser = argparse.ArgumentParser(description=__doc__)


def add_arg(argname, type, default, help, **kwargs):
    type = distutils.util.strtobool if type == bool else type
    parser.add_argument(
        "--" + argname,
        default=default,
        type=type,
        help=help + ' Default: %(default)s.',
        **kwargs)


# yapf: disable
# configurations of optimization
add_arg('batch_size',      int,   256,   "Minibatch size.")
add_arg('learning_rate',   float, 5e-4,  "Learning rate.")
add_arg('use_sortagrad',   bool,  True,  "Use SortaGrad or not.")
add_arg('trainer_count',   int,   8,     "# of Trainers (CPUs or GPUs).")
add_arg('use_gpu',         bool,  True,  "Use GPU or not.")
add_arg('num_passes',      int,   200,   "# of training epochs.")
add_arg('is_local',        bool,  True,  "Use pserver or not.")
add_arg('num_iter_print',  int,   100,
        "Every # iterations for printing train cost.")
# configurations of data preprocess
add_arg('max_duration',    float, 27.0,  "Longest audio duration allowed.")
add_arg('min_duration',    float, 0.0,   "Shortest audio duration allowed.")
add_arg('parallels_data',  int,   NUM_CPU, "# of CPUs for data preprocessing.")
add_arg('specgram_type',   str,   'linear',
        "Audio feature type. Options: linear, mfcc.",
        choices=['linear', 'mfcc'])
add_arg('augment_conf_path', str, 'conf/augmentation.config',
        "Filepath of augmentation configuration file (json-format).")
add_arg('shuffle_method',  str,   'batch_shuffle_clipped',
        "Shuffle method.",
        choices=['instance_shuffle', 'batch_shuffle', 'batch_shuffle_clipped'])
# configurations of model structure
add_arg('num_conv_layers', int,   2,     "# of convolution layers.")
add_arg('num_rnn_layers',  int,   3,     "# of recurrent layers.")
add_arg('rnn_layer_size',  int,   2048,  "# of recurrent cells per layer.")
add_arg('use_gru',         bool,  False, "Use GRUs instead of Simple RNNs.")
add_arg('share_rnn_weights', bool, True,
        "Share input-hidden weights across bi-directional RNNs. Not for GRU.")
# configurations of data io
add_arg('train_manifest',  str,   'datasets/manifest.train',
        "Filepath of train manifest.")
add_arg('dev_manifest',    str,   'datasets/manifest.dev',
        "Filepath of validation manifest.")
add_arg('mean_std_path',   str,   'mean_std.npz',
        "Filepath of normalizer's mean & std.")
add_arg('vocab_path',      str,   'datasets/vocab/eng_vocab.txt',
        "Filepath of vocabulary.")
# configurations of model io
add_arg('init_model_path', str,   None,
        "If None, the training starts from scratch, "
        "otherwise, it resumes from the pre-trained model.")
add_arg('output_model_dir', str,  "./checkpoints",
        "Directory for saving checkpoints.")
args = parser.parse_args()
# yapf: disable


def train():
    """DeepSpeech2 training."""
    train_generator = DataGenerator(
-        vocab_filepath=args.vocab_filepath,
-        mean_std_filepath=args.mean_std_filepath,
-        augmentation_config=args.augmentation_config,
+        vocab_filepath=args.vocab_path,
+        mean_std_filepath=args.mean_std_path,
+        augmentation_config=open(args.augment_conf_path, 'r').read(),
        max_duration=args.max_duration,
        min_duration=args.min_duration,
        specgram_type=args.specgram_type,
-        num_threads=args.num_threads_data)
+        num_threads=args.parallels_data)
    dev_generator = DataGenerator(
-        vocab_filepath=args.vocab_filepath,
-        mean_std_filepath=args.mean_std_filepath,
+        vocab_filepath=args.vocab_path,
+        mean_std_filepath=args.mean_std_path,
        augmentation_config="{}",
        specgram_type=args.specgram_type,
-        num_threads=args.num_threads_data)
+        num_threads=args.parallels_data)
    train_batch_reader = train_generator.batch_reader_creator(
-        manifest_path=args.train_manifest_path,
+        manifest_path=args.train_manifest,
        batch_size=args.batch_size,
        min_batch_size=args.trainer_count,
        sortagrad=args.use_sortagrad if args.init_model_path is None else False,
        shuffle_method=args.shuffle_method)
    dev_batch_reader = dev_generator.batch_reader_creator(
-        manifest_path=args.dev_manifest_path,
+        manifest_path=args.dev_manifest,
        batch_size=args.batch_size,
        min_batch_size=1,  # must be 1, but will have errors.
        sortagrad=False,

@@ -184,21 +118,28 @@ def train():
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        pretrained_model_path=args.init_model_path,
        share_rnn_weights=args.share_rnn_weights)
    ds2_model.train(
        train_batch_reader=train_batch_reader,
        dev_batch_reader=dev_batch_reader,
        feeding_dict=train_generator.feeding,
-        learning_rate=args.adam_learning_rate,
+        learning_rate=args.learning_rate,
        gradient_clipping=400,
        num_passes=args.num_passes,
-        num_iterations_print=args.num_iterations_print,
+        num_iterations_print=args.num_iter_print,
        output_model_dir=args.output_model_dir,
        is_local=args.is_local)


+def print_arguments(args):
+    print("----------- Configuration Arguments -----------")
+    for arg, value in sorted(vars(args).iteritems()):
+        print("%s: %s" % (arg, value))
+    print("------------------------------------------------")
+
+
def main():
-    utils.print_arguments(args)
+    print_arguments(args)
    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
    train()
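train.py now loads the augmentation configuration from the file named by --augment_conf_path instead of baking the JSON string into a flag default. A hedged example of writing and consuming such a file is below; the "shift" augmentor and its parameters follow the project's documented examples, but treat the exact schema as an assumption.

import json

# Assumed schema: a JSON list of augmentor specs with type/params/prob keys.
augmentation_conf = [{
    "type": "shift",
    "params": {"min_shift_ms": -5, "max_shift_ms": 5},
    "prob": 1.0
}]

with open('conf/augmentation.config', 'w') as f:
    json.dump(augmentation_conf, f)

# train.py then passes the raw JSON text to the data generator:
#     augmentation_config=open(args.augment_conf_path, 'r').read()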

@@ -1,4 +1,4 @@
-"""Parameters tuning for DeepSpeech2 model."""
+"""Beam search parameters tuning for DeepSpeech2 model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

@@ -11,134 +11,71 @@ import paddle.v2 as paddle
from data_utils.data import DataGenerator
from model import DeepSpeech2Model
from error_rate import wer

NUM_CPU = multiprocessing.cpu_count() // 2

parser = argparse.ArgumentParser(description=__doc__)


def add_arg(argname, type, default, help, **kwargs):
    type = distutils.util.strtobool if type == bool else type
    parser.add_argument(
        "--" + argname,
        default=default,
        type=type,
        help=help + ' Default: %(default)s.',
        **kwargs)


# yapf: disable
# configurations of overall
add_arg('num_samples',     int,   100,   "# of samples to infer.")
add_arg('trainer_count',   int,   8,     "# of Trainers (CPUs or GPUs).")
add_arg('use_gpu',         bool,  True,  "Use GPU or not.")
add_arg('error_rate_type', str,   'wer',
        "Error rate type for evaluation.", choices=['wer', 'cer'])
# configurations of tuning parameters
add_arg('alpha_from',      float, 0.1,   "Where alpha starts tuning from.")
add_arg('alpha_to',        float, 0.36,  "Where alpha ends tuning with.")
add_arg('num_alphas',      int,   14,    "# of alpha candidates for tuning.")
add_arg('beta_from',       float, 0.05,  "Where beta starts tuning from.")
add_arg('beta_to',         float, 0.36,  "Where beta ends tuning with.")
add_arg('num_betas',       int,   20,    "# of beta candidates for tuning.")
# configurations of decoder
add_arg('beam_size',       int,   500,   "Beam search width.")
add_arg('cutoff_prob',     float, 0.99,  "Cutoff probability for pruning.")
add_arg('parallels_bsearch', int, NUM_CPU, "# of CPUs for beam search.")
add_arg('lang_model_path', str,
        'lm/data/common_crawl_00.prune01111.trie.klm',
        "Filepath for language model.")
# configurations of data preprocess
add_arg('specgram_type',   str,   'linear',
        "Audio feature type. Options: linear, mfcc.",
        choices=['linear', 'mfcc'])
# configurations of model structure
add_arg('num_conv_layers', int,   2,     "# of convolution layers.")
add_arg('num_rnn_layers',  int,   3,     "# of recurrent layers.")
add_arg('rnn_layer_size',  int,   2048,  "# of recurrent cells per layer.")
add_arg('use_gru',         bool,  False, "Use GRUs instead of Simple RNNs.")
add_arg('share_rnn_weights', bool, True,
        "Share input-hidden weights across bi-directional RNNs. Not for GRU.")
# configurations of data io
add_arg('tune_manifest',   str,   'datasets/manifest.test',
        "Filepath of manifest to tune.")
add_arg('mean_std_path',   str,   'mean_std.npz',
        "Filepath of normalizer's mean & std.")
add_arg('vocab_path',      str,   'datasets/vocab/eng_vocab.txt',
        "Filepath of vocabulary.")
# configurations of model io
add_arg('model_path',      str,   './checkpoints/params.latest.tar.gz',
        "If None, the training starts from scratch, "
        "otherwise, it resumes from the pre-trained model.")
args = parser.parse_args()
# yapf: disable


def tune():

@@ -149,13 +86,13 @@ def tune():
        raise ValueError("num_betas must be non-negative!")

    data_generator = DataGenerator(
-        vocab_filepath=args.vocab_filepath,
-        mean_std_filepath=args.mean_std_filepath,
+        vocab_filepath=args.vocab_path,
+        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
-        num_threads=args.num_threads_data)
+        num_threads=1)
    batch_reader = data_generator.batch_reader_creator(
-        manifest_path=args.tune_manifest_path,
+        manifest_path=args.tune_manifest,
        batch_size=args.num_samples,
        sortagrad=False,
        shuffle_method=None)

@@ -171,7 +108,7 @@ def tune():
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
-        pretrained_model_path=args.model_filepath,
+        pretrained_model_path=args.model_path,
        share_rnn_weights=args.share_rnn_weights)

    # create grid for search
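The tuning flags above (alpha_from/alpha_to/num_alphas and beta_from/beta_to/num_betas) feed the grid referenced by this comment. A hedged sketch of building such a grid, using the flag defaults as literal values, is shown below; the repository's tune() may construct it slightly differently.

import itertools

import numpy as np

# Evenly spaced candidates over the tuning ranges (defaults from the flags above).
cand_alphas = np.linspace(0.1, 0.36, 14)
cand_betas = np.linspace(0.05, 0.36, 20)
params_grid = [(alpha, beta)
               for alpha, beta in itertools.product(cand_alphas, cand_betas)]
print(len(params_grid))  # 14 * 20 = 280 (alpha, beta) pairs to evaluate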
@@ -184,14 +121,14 @@ def tune():
    for alpha, beta in params_grid:
        result_transcripts = ds2_model.infer_batch(
            infer_data=tune_data,
-            decode_method='beam_search',
+            decoder_method='ctc_beam_search',
            beam_alpha=alpha,
            beam_beta=beta,
            beam_size=args.beam_size,
            cutoff_prob=args.cutoff_prob,
            vocab_list=data_generator.vocab_list,
-            language_model_path=args.language_model_path,
+            language_model_path=args.lang_model_path,
-            num_processes=args.num_processes_beam_search)
+            num_processes=args.parallels_bsearch)
        wer_sum, num_ins = 0.0, 0
        for target, result in zip(target_transcripts, result_transcripts):
            wer_sum += wer(target, result)

@@ -200,8 +137,15 @@ def tune():
              (alpha, beta, wer_sum / num_ins))


+def print_arguments(args):
+    print("----------- Configuration Arguments -----------")
+    for arg, value in sorted(vars(args).iteritems()):
+        print("%s: %s" % (arg, value))
+    print("------------------------------------------------")
+
+
def main():
-    utils.print_arguments(args)
+    print_arguments(args)
    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
    tune()

@ -1,25 +0,0 @@
"""Contains common utility functions."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
def print_arguments(args):
"""Print argparse's arguments.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
parser.add_argument("name", default="Jonh", type=str, help="User name.")
args = parser.parse_args()
print_arguments(args)
:param args: Input argparse.Namespace for printing.
:type args: argparse.Namespace
"""
print("----- Configuration Arguments -----")
for arg, value in vars(args).iteritems():
print("%s: %s" % (arg, value))
print("------------------------------------")