Reduce the config parsing codes for DS2 and make it looks cleaner.

pull/2/head
Xinghai Sun 7 years ago
parent 2aa4af1c29
commit 805846ce67

@ -9,8 +9,9 @@ from math import log
import multiprocessing
def ctc_best_path_decoder(probs_seq, vocabulary):
"""Best path decoder, also called argmax decoder or greedy decoder.
def ctc_greedy_decoder(probs_seq, vocabulary):
"""CTC greedy (best path) decoder.
Path consisting of the most probable tokens are further post-processed to
remove consecutive repetitions and all blanks.
@ -45,10 +46,12 @@ def ctc_beam_search_decoder(probs_seq,
cutoff_prob=1.0,
ext_scoring_func=None,
nproc=False):
"""Beam search decoder for CTC-trained network. It utilizes beam search
to approximately select top best decoding labels and returning results
in the descending order. The implementation is based on Prefix
Beam Search (https://arxiv.org/abs/1408.2873), and the unclear part is
"""CTC Beam search decoder.
It utilizes beam search to approximately select top best decoding
labels and returning results in the descending order.
The implementation is based on Prefix Beam Search
(https://arxiv.org/abs/1408.2873), and the unclear part is
redesigned. Two important modifications: 1) in the iterative computation
of probabilities, the assignment operation is changed to accumulation for
one prefix may comes from different paths; 2) the if condition "if l^+ not

@ -9,118 +9,74 @@ import SocketServer
import struct
import wave
import paddle.v2 as paddle
from utils import print_arguments
from data_utils.data import DataGenerator
from model import DeepSpeech2Model
from data_utils.utils import read_manifest
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--host_ip",
default="localhost",
type=str,
help="Server IP address. (default: %(default)s)")
parser.add_argument(
"--host_port",
default=8086,
type=int,
help="Server Port. (default: %(default)s)")
parser.add_argument(
"--speech_save_dir",
default="demo_cache",
type=str,
help="Directory for saving demo speech. (default: %(default)s)")
parser.add_argument(
"--vocab_filepath",
default='datasets/vocab/eng_vocab.txt',
type=str,
help="Vocabulary filepath. (default: %(default)s)")
parser.add_argument(
"--mean_std_filepath",
default='mean_std.npz',
type=str,
help="Manifest path for normalizer. (default: %(default)s)")
parser.add_argument(
"--warmup_manifest_path",
default='datasets/manifest.test',
type=str,
help="Manifest path for warmup test. (default: %(default)s)")
parser.add_argument(
"--specgram_type",
default='linear',
type=str,
help="Feature type of audio data: 'linear' (power spectrum)"
" or 'mfcc'. (default: %(default)s)")
parser.add_argument(
"--num_conv_layers",
default=2,
type=int,
help="Convolution layer number. (default: %(default)s)")
parser.add_argument(
"--num_rnn_layers",
default=3,
type=int,
help="RNN layer number. (default: %(default)s)")
parser.add_argument(
"--rnn_layer_size",
default=2048,
type=int,
help="RNN layer cell number. (default: %(default)s)")
parser.add_argument(
"--share_rnn_weights",
default=True,
type=distutils.util.strtobool,
help="Whether to share input-hidden weights between forword and backward "
"directional simple RNNs. Only available when use_gru=False. "
"(default: %(default)s)")
parser.add_argument(
"--use_gru",
default=False,
type=distutils.util.strtobool,
help="Use GRU or simple RNN. (default: %(default)s)")
parser.add_argument(
"--use_gpu",
default=True,
type=distutils.util.strtobool,
help="Use gpu or not. (default: %(default)s)")
parser.add_argument(
"--model_filepath",
default='checkpoints/params.latest.tar.gz',
type=str,
help="Model filepath. (default: %(default)s)")
parser.add_argument(
"--decode_method",
default='beam_search',
type=str,
help="Method for ctc decoding: best_path or beam_search. "
"(default: %(default)s)")
parser.add_argument(
"--beam_size",
default=100,
type=int,
help="Width for beam search decoding. (default: %(default)d)")
parser.add_argument(
"--language_model_path",
default="lm/data/common_crawl_00.prune01111.trie.klm",
type=str,
help="Path for language model. (default: %(default)s)")
parser.add_argument(
"--alpha",
default=0.36,
type=float,
help="Parameter associated with language model. (default: %(default)f)")
parser.add_argument(
"--beta",
default=0.25,
type=float,
help="Parameter associated with word count. (default: %(default)f)")
parser.add_argument(
"--cutoff_prob",
default=0.99,
type=float,
help="The cutoff probability of pruning"
"in beam search. (default: %(default)f)")
def add_arg(argname, type, default, help, **kwargs):
type = distutils.util.strtobool if type == bool else type
parser.add_argument(
"--" + argname,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
# yapf: disable
# configurations of overall
add_arg('host_port', int, 8086, "Server's IP port.")
add_arg('host_ip', str,
'localhost',
"Server's IP address.")
add_arg('speech_save_dir', str,
'demo_cache',
"Directory to save demo audios.")
add_arg('use_gpu', bool, True, "Use GPU or not.")
# configurations of decoder
add_arg('beam_size', int, 500, "Beam search width.")
add_arg('alpha', float, 0.36, "Coef of LM for beam search.")
add_arg('beta', float, 0.25, "Coef of WC for beam search.")
add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.")
add_arg('lang_model_path', str,
'lm/data/common_crawl_00.prune01111.trie.klm',
"Filepath for language model.")
add_arg('decoder_method', str,
'ctc_beam_search',
"Decoder method. Options: ctc_beam_search, ctc_greedy",
choices = ['ctc_beam_search', 'ctc_greedy'])
# configurations of data preprocess
add_arg('specgram_type', str,
'linear',
"Audio feature type. Options: linear, mfcc.",
choices=['linear', 'mfcc'])
# configurations of model structure
add_arg('num_conv_layers', int, 2, "# of convolution layers.")
add_arg('num_rnn_layers', int, 3, "# of recurrent layers.")
add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.")
add_arg('use_gru', bool, False, "Use GRUs instead of Simple RNNs.")
add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across "
"bi-directional RNNs. Not for GRU.")
# configurations of data io
add_arg('warmup_manifest', str,
'datasets/manifest.test',
"Filepath of manifest to warm up.")
add_arg('mean_std_path', str,
'mean_std.npz',
"Filepath of normalizer's mean & std.")
add_arg('vocab_path', str,
'datasets/vocab/eng_vocab.txt',
"Filepath of vocabulary.")
# configurations of model io
add_arg('model_path', str,
'./checkpoints/params.latest.tar.gz',
"If None, the training starts from scratch, "
"otherwise, it resumes from the pre-trained model.")
args = parser.parse_args()
# yapf: disable
class AsrTCPServer(SocketServer.TCPServer):
@ -200,8 +156,8 @@ def start_server():
"""Start the ASR server"""
# prepare data generator
data_generator = DataGenerator(
vocab_filepath=args.vocab_filepath,
mean_std_filepath=args.mean_std_filepath,
vocab_filepath=args.vocab_path,
mean_std_filepath=args.mean_std_path,
augmentation_config='{}',
specgram_type=args.specgram_type,
num_threads=1)
@ -212,7 +168,7 @@ def start_server():
num_rnn_layers=args.num_rnn_layers,
rnn_layer_size=args.rnn_layer_size,
use_gru=args.use_gru,
pretrained_model_path=args.model_filepath,
pretrained_model_path=args.model_path,
share_rnn_weights=args.share_rnn_weights)
# prepare ASR inference handler
@ -220,13 +176,13 @@ def start_server():
feature = data_generator.process_utterance(filename, "")
result_transcript = ds2_model.infer_batch(
infer_data=[feature],
decode_method=args.decode_method,
decoder_method=args.decoder_method,
beam_alpha=args.alpha,
beam_beta=args.beta,
beam_size=args.beam_size,
cutoff_prob=args.cutoff_prob,
vocab_list=data_generator.vocab_list,
language_model_path=args.language_model_path,
language_model_path=args.lang_model_path,
num_processes=1)
return result_transcript[0]
@ -235,7 +191,7 @@ def start_server():
print('Warming up ...')
warm_up_test(
audio_process_handler=file_to_transcript,
manifest_path=args.warmup_manifest_path,
manifest_path=args.warmup_manifest,
num_test_cases=3)
print('-----------------------------------------------------------')
@ -249,6 +205,13 @@ def start_server():
server.serve_forever()
def print_arguments(args):
print("----------- Configuration Arguments -----------")
for arg, value in sorted(vars(args).iteritems()):
print("%s: %s" % (arg, value))
print("------------------------------------------------")
def main():
print_arguments(args)
paddle.init(use_gpu=args.use_gpu, trainer_count=1)

@ -10,140 +10,83 @@ import paddle.v2 as paddle
from data_utils.data import DataGenerator
from model import DeepSpeech2Model
from error_rate import wer, cer
import utils
NUM_CPU = multiprocessing.cpu_count() // 2
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--batch_size",
default=128,
type=int,
help="Minibatch size for evaluation. (default: %(default)s)")
parser.add_argument(
"--trainer_count",
default=8,
type=int,
help="Trainer number. (default: %(default)s)")
parser.add_argument(
"--num_conv_layers",
default=2,
type=int,
help="Convolution layer number. (default: %(default)s)")
parser.add_argument(
"--num_rnn_layers",
default=3,
type=int,
help="RNN layer number. (default: %(default)s)")
parser.add_argument(
"--rnn_layer_size",
default=2048,
type=int,
help="RNN layer cell number. (default: %(default)s)")
parser.add_argument(
"--share_rnn_weights",
default=True,
type=distutils.util.strtobool,
help="Whether to share input-hidden weights between forword and backward "
"directional simple RNNs. Only available when use_gru=False. "
"(default: %(default)s)")
parser.add_argument(
"--use_gru",
default=False,
type=distutils.util.strtobool,
help="Use GRU or simple RNN. (default: %(default)s)")
parser.add_argument(
"--use_gpu",
default=True,
type=distutils.util.strtobool,
help="Use gpu or not. (default: %(default)s)")
parser.add_argument(
"--num_threads_data",
default=multiprocessing.cpu_count() // 2,
type=int,
help="Number of cpu threads for preprocessing data. (default: %(default)s)")
parser.add_argument(
"--num_processes_beam_search",
default=multiprocessing.cpu_count() // 2,
type=int,
help="Number of cpu processes for beam search. (default: %(default)s)")
parser.add_argument(
"--mean_std_filepath",
default='mean_std.npz',
type=str,
help="Manifest path for normalizer. (default: %(default)s)")
parser.add_argument(
"--decode_method",
default='beam_search',
type=str,
help="Method for ctc decoding, best_path or beam_search. "
"(default: %(default)s)")
parser.add_argument(
"--language_model_path",
default="lm/data/common_crawl_00.prune01111.trie.klm",
type=str,
help="Path for language model. (default: %(default)s)")
parser.add_argument(
"--alpha",
default=0.36,
type=float,
help="Parameter associated with language model. (default: %(default)f)")
parser.add_argument(
"--beta",
default=0.25,
type=float,
help="Parameter associated with word count. (default: %(default)f)")
parser.add_argument(
"--cutoff_prob",
default=0.99,
type=float,
help="The cutoff probability of pruning"
"in beam search. (default: %(default)f)")
parser.add_argument(
"--beam_size",
default=500,
type=int,
help="Width for beam search decoding. (default: %(default)d)")
parser.add_argument(
"--specgram_type",
default='linear',
type=str,
help="Feature type of audio data: 'linear' (power spectrum)"
" or 'mfcc'. (default: %(default)s)")
parser.add_argument(
"--decode_manifest_path",
default='datasets/manifest.test',
type=str,
help="Manifest path for decoding. (default: %(default)s)")
parser.add_argument(
"--model_filepath",
default='checkpoints/params.latest.tar.gz',
type=str,
help="Model filepath. (default: %(default)s)")
parser.add_argument(
"--vocab_filepath",
default='datasets/vocab/eng_vocab.txt',
type=str,
help="Vocabulary filepath. (default: %(default)s)")
parser.add_argument(
"--error_rate_type",
default='wer',
choices=['wer', 'cer'],
type=str,
help="Error rate type for evaluation. 'wer' for word error rate and 'cer' "
"for character error rate. "
"(default: %(default)s)")
def add_arg(argname, type, default, help, **kwargs):
type = distutils.util.strtobool if type == bool else type
parser.add_argument(
"--" + argname,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
# yapf: disable
# configurations of overall
add_arg('batch_size', int, 128, "Minibatch size.")
add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).")
add_arg('use_gpu', bool, True, "Use GPU or not.")
add_arg('error_rate_type', str, 'wer', "Error rate type for evaluation.",
choices=['wer', 'cer'])
# configurations of decoder
add_arg('beam_size', int, 500, "Beam search width.")
add_arg('alpha', float, 0.36, "Coef of LM for beam search.")
add_arg('beta', float, 0.25, "Coef of WC for beam search.")
add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.")
add_arg('parallels_bsearch',int, NUM_CPU,"# of CPUs for beam search.")
add_arg('lang_model_path', str,
'lm/data/common_crawl_00.prune01111.trie.klm',
"Filepath for language model.")
add_arg('decoder_method', str,
'ctc_beam_search',
"Decoder method. Options: ctc_beam_search, ctc_greedy",
choices = ['ctc_beam_search', 'ctc_greedy'])
# configurations of data preprocess
add_arg('parallels_data', int, NUM_CPU,"# of CPUs for data preprocessing.")
add_arg('specgram_type', str,
'linear',
"Audio feature type. Options: linear, mfcc.",
choices=['linear', 'mfcc'])
# configurations of model structure
add_arg('num_conv_layers', int, 2, "# of convolution layers.")
add_arg('num_rnn_layers', int, 3, "# of recurrent layers.")
add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.")
add_arg('use_gru', bool, False, "Use GRUs instead of Simple RNNs.")
add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across "
"bi-directional RNNs. Not for GRU.")
# configurations of data io
add_arg('test_manifest', str,
'datasets/manifest.test',
"Filepath of manifest to evaluate.")
add_arg('mean_std_path', str,
'mean_std.npz',
"Filepath of normalizer's mean & std.")
add_arg('vocab_path', str,
'datasets/vocab/eng_vocab.txt',
"Filepath of vocabulary.")
# configurations of model io
add_arg('model_path', str,
'./checkpoints/params.latest.tar.gz',
"If None, the training starts from scratch, "
"otherwise, it resumes from the pre-trained model.")
args = parser.parse_args()
# yapf: disable
def evaluate():
"""Evaluate on whole test data for DeepSpeech2."""
data_generator = DataGenerator(
vocab_filepath=args.vocab_filepath,
mean_std_filepath=args.mean_std_filepath,
vocab_filepath=args.vocab_path,
mean_std_filepath=args.mean_std_path,
augmentation_config='{}',
specgram_type=args.specgram_type,
num_threads=args.num_threads_data)
num_threads=args.parallels_data)
batch_reader = data_generator.batch_reader_creator(
manifest_path=args.decode_manifest_path,
manifest_path=args.test_manifest,
batch_size=args.batch_size,
min_batch_size=1,
sortagrad=False,
@ -155,7 +98,7 @@ def evaluate():
num_rnn_layers=args.num_rnn_layers,
rnn_layer_size=args.rnn_layer_size,
use_gru=args.use_gru,
pretrained_model_path=args.model_filepath,
pretrained_model_path=args.model_path,
share_rnn_weights=args.share_rnn_weights)
error_rate_func = cer if args.error_rate_type == 'cer' else wer
@ -163,14 +106,14 @@ def evaluate():
for infer_data in batch_reader():
result_transcripts = ds2_model.infer_batch(
infer_data=infer_data,
decode_method=args.decode_method,
decoder_method=args.decoder_method,
beam_alpha=args.alpha,
beam_beta=args.beta,
beam_size=args.beam_size,
cutoff_prob=args.cutoff_prob,
vocab_list=data_generator.vocab_list,
language_model_path=args.language_model_path,
num_processes=args.num_processes_beam_search)
language_model_path=args.lang_model_path,
num_processes=args.parallels_bsearch)
target_transcripts = [
''.join([data_generator.vocab_list[token] for token in transcript])
for _, transcript in infer_data
@ -184,8 +127,15 @@ def evaluate():
(args.error_rate_type, num_ins, num_ins, error_sum / num_ins))
def print_arguments(args):
print("----------- Configuration Arguments -----------")
for arg, value in sorted(vars(args).iteritems()):
print("%s: %s" % (arg, value))
print("------------------------------------------------")
def main():
utils.print_arguments(args)
print_arguments(args)
paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
evaluate()

@ -10,140 +10,82 @@ import paddle.v2 as paddle
from data_utils.data import DataGenerator
from model import DeepSpeech2Model
from error_rate import wer, cer
import utils
NUM_CPU = multiprocessing.cpu_count() // 2
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--num_samples",
default=10,
type=int,
help="Number of samples for inference. (default: %(default)s)")
parser.add_argument(
"--num_conv_layers",
default=2,
type=int,
help="Convolution layer number. (default: %(default)s)")
parser.add_argument(
"--num_rnn_layers",
default=3,
type=int,
help="RNN layer number. (default: %(default)s)")
parser.add_argument(
"--rnn_layer_size",
default=2048,
type=int,
help="RNN layer cell number. (default: %(default)s)")
parser.add_argument(
"--share_rnn_weights",
default=True,
type=distutils.util.strtobool,
help="Whether to share input-hidden weights between forword and backward "
"directional simple RNNs. Only available when use_gru=False. "
"(default: %(default)s)")
parser.add_argument(
"--use_gru",
default=False,
type=distutils.util.strtobool,
help="Use GRU or simple RNN. (default: %(default)s)")
parser.add_argument(
"--use_gpu",
default=True,
type=distutils.util.strtobool,
help="Use gpu or not. (default: %(default)s)")
parser.add_argument(
"--num_threads_data",
default=1,
type=int,
help="Number of cpu threads for preprocessing data. (default: %(default)s)")
parser.add_argument(
"--num_processes_beam_search",
default=multiprocessing.cpu_count() // 2,
type=int,
help="Number of cpu processes for beam search. (default: %(default)s)")
parser.add_argument(
"--specgram_type",
default='linear',
type=str,
help="Feature type of audio data: 'linear' (power spectrum)"
" or 'mfcc'. (default: %(default)s)")
parser.add_argument(
"--trainer_count",
default=8,
type=int,
help="Trainer number. (default: %(default)s)")
parser.add_argument(
"--mean_std_filepath",
default='mean_std.npz',
type=str,
help="Manifest path for normalizer. (default: %(default)s)")
parser.add_argument(
"--decode_manifest_path",
default='datasets/manifest.test',
type=str,
help="Manifest path for decoding. (default: %(default)s)")
parser.add_argument(
"--model_filepath",
default='checkpoints/params.latest.tar.gz',
type=str,
help="Model filepath. (default: %(default)s)")
parser.add_argument(
"--vocab_filepath",
default='datasets/vocab/eng_vocab.txt',
type=str,
help="Vocabulary filepath. (default: %(default)s)")
parser.add_argument(
"--decode_method",
default='beam_search',
type=str,
help="Method for ctc decoding: best_path or beam_search. "
"(default: %(default)s)")
parser.add_argument(
"--beam_size",
default=500,
type=int,
help="Width for beam search decoding. (default: %(default)d)")
parser.add_argument(
"--language_model_path",
default="lm/data/common_crawl_00.prune01111.trie.klm",
type=str,
help="Path for language model. (default: %(default)s)")
parser.add_argument(
"--alpha",
default=0.36,
type=float,
help="Parameter associated with language model. (default: %(default)f)")
parser.add_argument(
"--beta",
default=0.25,
type=float,
help="Parameter associated with word count. (default: %(default)f)")
parser.add_argument(
"--cutoff_prob",
default=0.99,
type=float,
help="The cutoff probability of pruning"
"in beam search. (default: %(default)f)")
parser.add_argument(
"--error_rate_type",
default='wer',
choices=['wer', 'cer'],
type=str,
help="Error rate type for evaluation. 'wer' for word error rate and 'cer' "
"for character error rate. "
"(default: %(default)s)")
def add_arg(argname, type, default, help, **kwargs):
type = distutils.util.strtobool if type == bool else type
parser.add_argument(
"--" + argname,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
# yapf: disable
# configurations of overall
add_arg('num_samples', int, 10, "# of samples to infer.")
add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).")
add_arg('use_gpu', bool, True, "Use GPU or not.")
add_arg('error_rate_type', str, 'wer', "Error rate type for evaluation.",
choices=['wer', 'cer'])
# configurations of decoder
add_arg('beam_size', int, 500, "Beam search width.")
add_arg('alpha', float, 0.36, "Coef of LM for beam search.")
add_arg('beta', float, 0.25, "Coef of WC for beam search.")
add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.")
add_arg('parallels_bsearch',int, NUM_CPU,"# of CPUs for beam search.")
add_arg('lang_model_path', str,
'lm/data/common_crawl_00.prune01111.trie.klm',
"Filepath for language model.")
add_arg('decoder_method', str,
'ctc_beam_search',
"Decoder method. Options: ctc_beam_search, ctc_greedy",
choices = ['ctc_beam_search', 'ctc_greedy'])
# configurations of data preprocess
add_arg('specgram_type', str,
'linear',
"Audio feature type. Options: linear, mfcc.",
choices=['linear', 'mfcc'])
# configurations of model structure
add_arg('num_conv_layers', int, 2, "# of convolution layers.")
add_arg('num_rnn_layers', int, 3, "# of recurrent layers.")
add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.")
add_arg('use_gru', bool, False, "Use GRUs instead of Simple RNNs.")
add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across "
"bi-directional RNNs. Not for GRU.")
# configurations of data io
add_arg('infer_manifest', str,
'datasets/manifest.dev',
"Filepath of manifest to infer.")
add_arg('mean_std_path', str,
'mean_std.npz',
"Filepath of normalizer's mean & std.")
add_arg('vocab_path', str,
'datasets/vocab/eng_vocab.txt',
"Filepath of vocabulary.")
# configurations of model io
add_arg('model_path', str,
'./checkpoints/params.latest.tar.gz',
"If None, the training starts from scratch, "
"otherwise, it resumes from the pre-trained model.")
args = parser.parse_args()
# yapf: disable
def infer():
"""Inference for DeepSpeech2."""
data_generator = DataGenerator(
vocab_filepath=args.vocab_filepath,
mean_std_filepath=args.mean_std_filepath,
vocab_filepath=args.vocab_path,
mean_std_filepath=args.mean_std_path,
augmentation_config='{}',
specgram_type=args.specgram_type,
num_threads=args.num_threads_data)
num_threads=1)
batch_reader = data_generator.batch_reader_creator(
manifest_path=args.decode_manifest_path,
manifest_path=args.infer_manifest,
batch_size=args.num_samples,
min_batch_size=1,
sortagrad=False,
@ -156,18 +98,18 @@ def infer():
num_rnn_layers=args.num_rnn_layers,
rnn_layer_size=args.rnn_layer_size,
use_gru=args.use_gru,
pretrained_model_path=args.model_filepath,
pretrained_model_path=args.model_path,
share_rnn_weights=args.share_rnn_weights)
result_transcripts = ds2_model.infer_batch(
infer_data=infer_data,
decode_method=args.decode_method,
decoder_method=args.decoder_method,
beam_alpha=args.alpha,
beam_beta=args.beta,
beam_size=args.beam_size,
cutoff_prob=args.cutoff_prob,
vocab_list=data_generator.vocab_list,
language_model_path=args.language_model_path,
num_processes=args.num_processes_beam_search)
language_model_path=args.lang_model_path,
num_processes=args.parallels_bsearch)
error_rate_func = cer if args.error_rate_type == 'cer' else wer
target_transcripts = [
@ -181,8 +123,15 @@ def infer():
(args.error_rate_type, error_rate_func(target, result)))
def print_arguments(args):
print("----------- Configuration Arguments -----------")
for arg, value in sorted(vars(args).iteritems()):
print("%s: %s" % (arg, value))
print("------------------------------------------------")
def main():
utils.print_arguments(args)
print_arguments(args)
paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
infer()

@ -146,7 +146,7 @@ class DeepSpeech2Model(object):
# run inference
return self._loss_inferer.infer(input=infer_data)
def infer_batch(self, infer_data, decode_method, beam_alpha, beam_beta,
def infer_batch(self, infer_data, decoder_method, beam_alpha, beam_beta,
beam_size, cutoff_prob, vocab_list, language_model_path,
num_processes):
"""Model inference. Infer the transcription for a batch of speech
@ -156,9 +156,9 @@ class DeepSpeech2Model(object):
consisting of a tuple of audio features and
transcription text (empty string).
:type infer_data: list
:param decode_method: Decoding method name, 'best_path' or
'beam search'.
:param decode_method: string
:param decoder_method: Decoding method name, 'ctc_greedy' or
'ctc_beam_search'.
:param decoder_method: string
:param beam_alpha: Parameter associated with language model.
:type beam_alpha: float
:param beam_beta: Parameter associated with word count.
@ -190,13 +190,13 @@ class DeepSpeech2Model(object):
]
# run decoder
results = []
if decode_method == "best_path":
if decoder_method == "ctc_greedy":
# best path decode
for i, probs in enumerate(probs_split):
output_transcription = ctc_best_path_decoder(
output_transcription = ctc_greedy_decoder(
probs_seq=probs, vocabulary=vocab_list)
results.append(output_transcription)
elif decode_method == "beam_search":
elif decoder_method == "ctc_beam_search":
# initialize external scorer
if self._ext_scorer == None:
self._ext_scorer = LmScorer(beam_alpha, beam_beta,
@ -205,7 +205,6 @@ class DeepSpeech2Model(object):
else:
self._ext_scorer.reset_params(beam_alpha, beam_beta)
assert self._loaded_lm_path == language_model_path
# beam search decode
beam_search_results = ctc_beam_search_decoder_batch(
probs_split=probs_split,
@ -218,8 +217,8 @@ class DeepSpeech2Model(object):
results = [result[0][1] for result in beam_search_results]
else:
raise ValueError("Decoding method [%s] is not supported." %
decode_method)
raise ValueError("Decoder method [%s] is not supported." %
decoder_method)
return results
def _create_parameters(self, model_path=None):

@ -9,169 +9,103 @@ import multiprocessing
import paddle.v2 as paddle
from model import DeepSpeech2Model
from data_utils.data import DataGenerator
import utils
NUM_CPU = multiprocessing.cpu_count() // 2
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--batch_size", default=256, type=int, help="Minibatch size.")
parser.add_argument(
"--num_passes",
default=200,
type=int,
help="Training pass number. (default: %(default)s)")
parser.add_argument(
"--num_iterations_print",
default=100,
type=int,
help="Number of iterations for every train cost printing. "
"(default: %(default)s)")
parser.add_argument(
"--num_conv_layers",
default=2,
type=int,
help="Convolution layer number. (default: %(default)s)")
parser.add_argument(
"--num_rnn_layers",
default=3,
type=int,
help="RNN layer number. (default: %(default)s)")
parser.add_argument(
"--rnn_layer_size",
default=2048,
type=int,
help="RNN layer cell number. (default: %(default)s)")
parser.add_argument(
"--share_rnn_weights",
default=True,
type=distutils.util.strtobool,
help="Whether to share input-hidden weights between forword and backward "
"directional simple RNNs. Only available when use_gru=False. "
"(default: %(default)s)")
parser.add_argument(
"--use_gru",
default=False,
type=distutils.util.strtobool,
help="Use GRU or simple RNN. (default: %(default)s)")
parser.add_argument(
"--adam_learning_rate",
default=5e-4,
type=float,
help="Learning rate for ADAM Optimizer. (default: %(default)s)")
parser.add_argument(
"--use_gpu",
default=True,
type=distutils.util.strtobool,
help="Use gpu or not. (default: %(default)s)")
parser.add_argument(
"--use_sortagrad",
default=True,
type=distutils.util.strtobool,
help="Use sortagrad or not. (default: %(default)s)")
parser.add_argument(
"--specgram_type",
default='linear',
type=str,
help="Feature type of audio data: 'linear' (power spectrum)"
" or 'mfcc'. (default: %(default)s)")
parser.add_argument(
"--max_duration",
default=27.0,
type=float,
help="Audios with duration larger than this will be discarded. "
"(default: %(default)s)")
parser.add_argument(
"--min_duration",
default=0.0,
type=float,
help="Audios with duration smaller than this will be discarded. "
"(default: %(default)s)")
parser.add_argument(
"--shuffle_method",
default='batch_shuffle_clipped',
type=str,
help="Shuffle method: 'instance_shuffle', 'batch_shuffle', "
"'batch_shuffle_batch'. (default: %(default)s)")
parser.add_argument(
"--trainer_count",
default=8,
type=int,
help="Trainer number. (default: %(default)s)")
parser.add_argument(
"--num_threads_data",
default=multiprocessing.cpu_count() // 2,
type=int,
help="Number of cpu threads for preprocessing data. (default: %(default)s)")
parser.add_argument(
"--mean_std_filepath",
default='mean_std.npz',
type=str,
help="Manifest path for normalizer. (default: %(default)s)")
parser.add_argument(
"--train_manifest_path",
default='datasets/manifest.train',
type=str,
help="Manifest path for training. (default: %(default)s)")
parser.add_argument(
"--dev_manifest_path",
default='datasets/manifest.dev',
type=str,
help="Manifest path for validation. (default: %(default)s)")
parser.add_argument(
"--vocab_filepath",
default='datasets/vocab/eng_vocab.txt',
type=str,
help="Vocabulary filepath. (default: %(default)s)")
parser.add_argument(
"--init_model_path",
default=None,
type=str,
help="If set None, the training will start from scratch. "
"Otherwise, the training will resume from "
"the existing model of this path. (default: %(default)s)")
parser.add_argument(
"--output_model_dir",
default="./checkpoints",
type=str,
help="Directory for saving models. (default: %(default)s)")
parser.add_argument(
"--augmentation_config",
default=open('conf/augmentation.config', 'r').read(),
type=str,
help="Augmentation configuration in json-format. "
"(default: %(default)s)")
parser.add_argument(
"--is_local",
default=True,
type=distutils.util.strtobool,
help="Set to false if running with pserver in paddlecloud. "
"(default: %(default)s)")
def add_arg(argname, type, default, help, **kwargs):
type = distutils.util.strtobool if type == bool else type
parser.add_argument(
"--" + argname,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
# yapf: disable
# configurations of optimization
add_arg('batch_size', int, 256, "Minibatch size.")
add_arg('learning_rate', float, 5e-4, "Learning rate.")
add_arg('use_sortagrad', bool, True, "Use SortaGrad or not.")
add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).")
add_arg('use_gpu', bool, True, "Use GPU or not.")
add_arg('num_passes', int, 200, "# of training epochs.")
add_arg('is_local', bool, True, "Use pserver or not.")
add_arg('num_iter_print', int, 100, "Every # iterations for printing "
"train cost.")
# configurations of data preprocess
add_arg('max_duration', float, 27.0, "Longest audio duration allowed.")
add_arg('min_duration', float, 0.0, "Shortest audio duration allowed.")
add_arg('parallels_data', int, NUM_CPU,"# of CPUs for data preprocessing.")
add_arg('specgram_type', str,
'linear',
"Audio feature type. Options: linear, mfcc.",
choices=['linear', 'mfcc'])
add_arg('augment_conf_path',str,
'conf/augmentation.config',
"Filepath of augmentation configuration file (json-format).")
add_arg('shuffle_method', str,
'batch_shuffle_clipped',
"Shuffle method.",
choices=['instance_shuffle', 'batch_shuffle', 'batch_shuffle_clipped'])
# configurations of model structure
add_arg('num_conv_layers', int, 2, "# of convolution layers.")
add_arg('num_rnn_layers', int, 3, "# of recurrent layers.")
add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.")
add_arg('use_gru', bool, False, "Use GRUs instead of Simple RNNs.")
add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across "
"bi-directional RNNs. Not for GRU.")
# configurations of data io
add_arg('train_manifest', str,
'datasets/manifest.train',
"Filepath of train manifest.")
add_arg('dev_manifest', str,
'datasets/manifest.dev',
"Filepath of validation manifest.")
add_arg('mean_std_path', str,
'mean_std.npz',
"Filepath of normalizer's mean & std.")
add_arg('vocab_path', str,
'datasets/vocab/eng_vocab.txt',
"Filepath of vocabulary.")
# configurations of model io
add_arg('init_model_path', str,
None,
"If None, the training starts from scratch, "
"otherwise, it resumes from the pre-trained model.")
add_arg('output_model_dir', str,
"./checkpoints",
"Directory for saving checkpoints.")
args = parser.parse_args()
# yapf: disable
def train():
"""DeepSpeech2 training."""
train_generator = DataGenerator(
vocab_filepath=args.vocab_filepath,
mean_std_filepath=args.mean_std_filepath,
augmentation_config=args.augmentation_config,
vocab_filepath=args.vocab_path,
mean_std_filepath=args.mean_std_path,
augmentation_config=open(args.augment_conf_path, 'r').read(),
max_duration=args.max_duration,
min_duration=args.min_duration,
specgram_type=args.specgram_type,
num_threads=args.num_threads_data)
num_threads=args.parallels_data)
dev_generator = DataGenerator(
vocab_filepath=args.vocab_filepath,
mean_std_filepath=args.mean_std_filepath,
vocab_filepath=args.vocab_path,
mean_std_filepath=args.mean_std_path,
augmentation_config="{}",
specgram_type=args.specgram_type,
num_threads=args.num_threads_data)
num_threads=args.parallels_data)
train_batch_reader = train_generator.batch_reader_creator(
manifest_path=args.train_manifest_path,
manifest_path=args.train_manifest,
batch_size=args.batch_size,
min_batch_size=args.trainer_count,
sortagrad=args.use_sortagrad if args.init_model_path is None else False,
shuffle_method=args.shuffle_method)
dev_batch_reader = dev_generator.batch_reader_creator(
manifest_path=args.dev_manifest_path,
manifest_path=args.dev_manifest,
batch_size=args.batch_size,
min_batch_size=1, # must be 1, but will have errors.
sortagrad=False,
@ -184,21 +118,28 @@ def train():
rnn_layer_size=args.rnn_layer_size,
use_gru=args.use_gru,
pretrained_model_path=args.init_model_path,
share_rnn_weights=args.share_rnn_weights)
share_rnn_weights=args.share_weights)
ds2_model.train(
train_batch_reader=train_batch_reader,
dev_batch_reader=dev_batch_reader,
feeding_dict=train_generator.feeding,
learning_rate=args.adam_learning_rate,
learning_rate=args.learning_rate,
gradient_clipping=400,
num_passes=args.num_passes,
num_iterations_print=args.num_iterations_print,
num_iterations_print=args.num_iter_print,
output_model_dir=args.output_model_dir,
is_local=args.is_local)
def print_arguments(args):
print("----------- Configuration Arguments -----------")
for arg, value in sorted(vars(args).iteritems()):
print("%s: %s" % (arg, value))
print("------------------------------------------------")
def main():
utils.print_arguments(args)
print_arguments(args)
paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
train()

@ -1,4 +1,4 @@
"""Parameters tuning for DeepSpeech2 model."""
"""Beam search parameters tuning for DeepSpeech2 model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
@ -11,134 +11,71 @@ import paddle.v2 as paddle
from data_utils.data import DataGenerator
from model import DeepSpeech2Model
from error_rate import wer
import utils
NUM_CPU = multiprocessing.cpu_count() // 2
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--num_samples",
default=100,
type=int,
help="Number of samples for parameters tuning. (default: %(default)s)")
parser.add_argument(
"--num_conv_layers",
default=2,
type=int,
help="Convolution layer number. (default: %(default)s)")
parser.add_argument(
"--num_rnn_layers",
default=3,
type=int,
help="RNN layer number. (default: %(default)s)")
parser.add_argument(
"--rnn_layer_size",
default=2048,
type=int,
help="RNN layer cell number. (default: %(default)s)")
parser.add_argument(
"--share_rnn_weights",
default=True,
type=distutils.util.strtobool,
help="Whether to share input-hidden weights between forword and backward "
"directional simple RNNs. Only available when use_gru=False. "
"(default: %(default)s)")
parser.add_argument(
"--use_gru",
default=False,
type=distutils.util.strtobool,
help="Use GRU or simple RNN. (default: %(default)s)")
parser.add_argument(
"--use_gpu",
default=True,
type=distutils.util.strtobool,
help="Use gpu or not. (default: %(default)s)")
parser.add_argument(
"--trainer_count",
default=8,
type=int,
help="Trainer number. (default: %(default)s)")
parser.add_argument(
"--num_threads_data",
default=1,
type=int,
help="Number of cpu threads for preprocessing data. (default: %(default)s)")
parser.add_argument(
"--num_processes_beam_search",
default=multiprocessing.cpu_count() // 2,
type=int,
help="Number of cpu processes for beam search. (default: %(default)s)")
parser.add_argument(
"--specgram_type",
default='linear',
type=str,
help="Feature type of audio data: 'linear' (power spectrum)"
" or 'mfcc'. (default: %(default)s)")
parser.add_argument(
"--mean_std_filepath",
default='mean_std.npz',
type=str,
help="Manifest path for normalizer. (default: %(default)s)")
parser.add_argument(
"--tune_manifest_path",
default='datasets/manifest.dev',
type=str,
help="Manifest path for tuning. (default: %(default)s)")
parser.add_argument(
"--model_filepath",
default='checkpoints/params.latest.tar.gz',
type=str,
help="Model filepath. (default: %(default)s)")
parser.add_argument(
"--vocab_filepath",
default='datasets/vocab/eng_vocab.txt',
type=str,
help="Vocabulary filepath. (default: %(default)s)")
parser.add_argument(
"--beam_size",
default=500,
type=int,
help="Width for beam search decoding. (default: %(default)d)")
parser.add_argument(
"--language_model_path",
default="lm/data/common_crawl_00.prune01111.trie.klm",
type=str,
help="Path for language model. (default: %(default)s)")
parser.add_argument(
"--alpha_from",
default=0.1,
type=float,
help="Where alpha starts from. (default: %(default)f)")
parser.add_argument(
"--num_alphas",
default=14,
type=int,
help="Number of candidate alphas. (default: %(default)d)")
parser.add_argument(
"--alpha_to",
default=0.36,
type=float,
help="Where alpha ends with. (default: %(default)f)")
parser.add_argument(
"--beta_from",
default=0.05,
type=float,
help="Where beta starts from. (default: %(default)f)")
parser.add_argument(
"--num_betas",
default=20,
type=float,
help="Number of candidate betas. (default: %(default)d)")
parser.add_argument(
"--beta_to",
default=1.0,
type=float,
help="Where beta ends with. (default: %(default)f)")
parser.add_argument(
"--cutoff_prob",
default=0.99,
type=float,
help="The cutoff probability of pruning"
"in beam search. (default: %(default)f)")
def add_arg(argname, type, default, help, **kwargs):
type = distutils.util.strtobool if type == bool else type
parser.add_argument(
"--" + argname,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
# yapf: disable
# configurations of overall
add_arg('num_samples', int, 100, "# of samples to infer.")
add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).")
add_arg('use_gpu', bool, True, "Use GPU or not.")
add_arg('error_rate_type', str, 'wer', "Error rate type for evaluation.",
choices=['wer', 'cer'])
# configurations of tuning parameters
add_arg('alpha_from', float, 0.1, "Where alpha starts tuning from.")
add_arg('alpha_to', float, 0.36, "Where alpha ends tuning with.")
add_arg('num_alphas', int, 14, "# of alpha candidates for tuning.")
add_arg('beta_from', float, 0.05, "Where beta starts tuning from.")
add_arg('beta_to', float, 0.36, "Where beta ends tuning with.")
add_arg('num_betas', int, 20, "# of beta candidates for tuning.")
# configurations of decoder
add_arg('beam_size', int, 500, "Beam search width.")
add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.")
add_arg('parallels_bsearch',int, NUM_CPU,"# of CPUs for beam search.")
add_arg('lang_model_path', str,
'lm/data/common_crawl_00.prune01111.trie.klm',
"Filepath for language model.")
# configurations of data preprocess
add_arg('specgram_type', str,
'linear',
"Audio feature type. Options: linear, mfcc.",
choices=['linear', 'mfcc'])
# configurations of model structure
add_arg('num_conv_layers', int, 2, "# of convolution layers.")
add_arg('num_rnn_layers', int, 3, "# of recurrent layers.")
add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.")
add_arg('use_gru', bool, False, "Use GRUs instead of Simple RNNs.")
add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across "
"bi-directional RNNs. Not for GRU.")
# configurations of data io
add_arg('tune_manifest', str,
'datasets/manifest.test',
"Filepath of manifest to tune.")
add_arg('mean_std_path', str,
'mean_std.npz',
"Filepath of normalizer's mean & std.")
add_arg('vocab_path', str,
'datasets/vocab/eng_vocab.txt',
"Filepath of vocabulary.")
# configurations of model io
add_arg('model_path', str,
'./checkpoints/params.latest.tar.gz',
"If None, the training starts from scratch, "
"otherwise, it resumes from the pre-trained model.")
args = parser.parse_args()
# yapf: disable
def tune():
@ -149,13 +86,13 @@ def tune():
raise ValueError("num_betas must be non-negative!")
data_generator = DataGenerator(
vocab_filepath=args.vocab_filepath,
mean_std_filepath=args.mean_std_filepath,
vocab_filepath=args.vocab_path,
mean_std_filepath=args.mean_std_path,
augmentation_config='{}',
specgram_type=args.specgram_type,
num_threads=args.num_threads_data)
num_threads=1)
batch_reader = data_generator.batch_reader_creator(
manifest_path=args.tune_manifest_path,
manifest_path=args.tune_manifest,
batch_size=args.num_samples,
sortagrad=False,
shuffle_method=None)
@ -171,7 +108,7 @@ def tune():
num_rnn_layers=args.num_rnn_layers,
rnn_layer_size=args.rnn_layer_size,
use_gru=args.use_gru,
pretrained_model_path=args.model_filepath,
pretrained_model_path=args.model_path,
share_rnn_weights=args.share_rnn_weights)
# create grid for search
@ -184,14 +121,14 @@ def tune():
for alpha, beta in params_grid:
result_transcripts = ds2_model.infer_batch(
infer_data=tune_data,
decode_method='beam_search',
decoder_method='ctc_beam_search',
beam_alpha=alpha,
beam_beta=beta,
beam_size=args.beam_size,
cutoff_prob=args.cutoff_prob,
vocab_list=data_generator.vocab_list,
language_model_path=args.language_model_path,
num_processes=args.num_processes_beam_search)
language_model_path=args.lang_model_path,
num_processes=args.parallels_bsearch)
wer_sum, num_ins = 0.0, 0
for target, result in zip(target_transcripts, result_transcripts):
wer_sum += wer(target, result)
@ -200,8 +137,15 @@ def tune():
(alpha, beta, wer_sum / num_ins))
def print_arguments(args):
print("----------- Configuration Arguments -----------")
for arg, value in sorted(vars(args).iteritems()):
print("%s: %s" % (arg, value))
print("------------------------------------------------")
def main():
utils.print_arguments(args)
print_arguments(args)
paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
tune()

@ -1,25 +0,0 @@
"""Contains common utility functions."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
def print_arguments(args):
"""Print argparse's arguments.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
parser.add_argument("name", default="Jonh", type=str, help="User name.")
args = parser.parse_args()
print_arguments(args)
:param args: Input argparse.Namespace for printing.
:type args: argparse.Namespace
"""
print("----- Configuration Arguments -----")
for arg, value in vars(args).iteritems():
print("%s: %s" % (arg, value))
print("------------------------------------")
Loading…
Cancel
Save