Merge branch 'develop' of https://github.com/PaddlePaddle/models into ctc_decoder_deploy
commit 11ede80a48
@ -1,13 +0,0 @@
cd librispeech
python librispeech.py
if [ $? -ne 0 ]; then
    echo "Prepare LibriSpeech failed. Terminated."
    exit 1
fi
cd -

cat librispeech/manifest.train* | shuf > manifest.train
cat librispeech/manifest.dev-clean > manifest.dev
cat librispeech/manifest.test-clean > manifest.test

echo "All done."
@ -1,10 +0,0 @@
cd noise
python chime3_background.py
if [ $? -ne 0 ]; then
    echo "Prepare CHiME3 background noise failed. Terminated."
    exit 1
fi
cd -

cat noise/manifest.* > manifest.noise
echo "All done."
@ -0,0 +1,19 @@
"""Set up paths for DS2"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os.path
import sys


def add_path(path):
    if path not in sys.path:
        sys.path.insert(0, path)


this_dir = os.path.dirname(__file__)

# Add project path to PYTHONPATH
proj_path = os.path.join(this_dir, '..')
add_path(proj_path)
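For orientation, a script in a project subdirectory only needs to import this module before any project-level imports; the new tools/tune.py later in this diff does exactly that. A minimal sketch (only the importing script's location is assumed):

import _init_paths  # must come before project-level imports
from data_utils.data import DataGenerator
from models.model import DeepSpeech2Model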
@ -1,180 +0,0 @@
"""Evaluation for DeepSpeech2 model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import distutils.util
import argparse
import multiprocessing
import paddle.v2 as paddle
from data_utils.data import DataGenerator
from model import DeepSpeech2Model
from error_rate import wer, cer
import utils

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--batch_size",
    default=128,
    type=int,
    help="Minibatch size for evaluation. (default: %(default)s)")
parser.add_argument(
    "--trainer_count",
    default=8,
    type=int,
    help="Trainer number. (default: %(default)s)")
parser.add_argument(
    "--num_conv_layers",
    default=2,
    type=int,
    help="Convolution layer number. (default: %(default)s)")
parser.add_argument(
    "--num_rnn_layers",
    default=3,
    type=int,
    help="RNN layer number. (default: %(default)s)")
parser.add_argument(
    "--rnn_layer_size",
    default=512,
    type=int,
    help="RNN layer cell number. (default: %(default)s)")
parser.add_argument(
    "--use_gpu",
    default=True,
    type=distutils.util.strtobool,
    help="Use gpu or not. (default: %(default)s)")
parser.add_argument(
    "--num_threads_data",
    default=multiprocessing.cpu_count() // 2,
    type=int,
    help="Number of cpu threads for preprocessing data. (default: %(default)s)")
parser.add_argument(
    "--num_processes_beam_search",
    default=multiprocessing.cpu_count() // 2,
    type=int,
    help="Number of cpu processes for beam search. (default: %(default)s)")
parser.add_argument(
    "--mean_std_filepath",
    default='mean_std.npz',
    type=str,
    help="Manifest path for normalizer. (default: %(default)s)")
parser.add_argument(
    "--decode_method",
    default='beam_search',
    type=str,
    help="Method for ctc decoding, best_path or beam_search. "
    "(default: %(default)s)")
parser.add_argument(
    "--language_model_path",
    default="lm/data/common_crawl_00.prune01111.trie.klm",
    type=str,
    help="Path for language model. (default: %(default)s)")
parser.add_argument(
    "--alpha",
    default=0.36,
    type=float,
    help="Parameter associated with language model. (default: %(default)f)")
parser.add_argument(
    "--beta",
    default=0.25,
    type=float,
    help="Parameter associated with word count. (default: %(default)f)")
parser.add_argument(
    "--cutoff_prob",
    default=0.99,
    type=float,
    help="The cutoff probability of pruning "
    "in beam search. (default: %(default)f)")
parser.add_argument(
    "--beam_size",
    default=500,
    type=int,
    help="Width for beam search decoding. (default: %(default)d)")
parser.add_argument(
    "--specgram_type",
    default='linear',
    type=str,
    help="Feature type of audio data: 'linear' (power spectrum)"
    " or 'mfcc'. (default: %(default)s)")
parser.add_argument(
    "--decode_manifest_path",
    default='datasets/manifest.test',
    type=str,
    help="Manifest path for decoding. (default: %(default)s)")
parser.add_argument(
    "--model_filepath",
    default='checkpoints/params.latest.tar.gz',
    type=str,
    help="Model filepath. (default: %(default)s)")
parser.add_argument(
    "--vocab_filepath",
    default='datasets/vocab/eng_vocab.txt',
    type=str,
    help="Vocabulary filepath. (default: %(default)s)")
parser.add_argument(
    "--error_rate_type",
    default='wer',
    choices=['wer', 'cer'],
    type=str,
    help="Error rate type for evaluation. 'wer' for word error rate and 'cer' "
    "for character error rate. "
    "(default: %(default)s)")
args = parser.parse_args()


def evaluate():
    """Evaluate on whole test data for DeepSpeech2."""
    data_generator = DataGenerator(
        vocab_filepath=args.vocab_filepath,
        mean_std_filepath=args.mean_std_filepath,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        num_threads=args.num_threads_data)
    batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.decode_manifest_path,
        batch_size=args.batch_size,
        min_batch_size=1,
        sortagrad=False,
        shuffle_method=None)

    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        pretrained_model_path=args.model_filepath)

    error_rate_func = cer if args.error_rate_type == 'cer' else wer
    error_sum, num_ins = 0.0, 0
    for infer_data in batch_reader():
        result_transcripts = ds2_model.infer_batch(
            infer_data=infer_data,
            decode_method=args.decode_method,
            beam_alpha=args.alpha,
            beam_beta=args.beta,
            beam_size=args.beam_size,
            cutoff_prob=args.cutoff_prob,
            vocab_list=data_generator.vocab_list,
            language_model_path=args.language_model_path,
            num_processes=args.num_processes_beam_search)
        target_transcripts = [
            ''.join([data_generator.vocab_list[token] for token in transcript])
            for _, transcript in infer_data
        ]
        for target, result in zip(target_transcripts, result_transcripts):
            error_sum += error_rate_func(target, result)
            num_ins += 1
        print("Error rate [%s] (%d/?) = %f" %
              (args.error_rate_type, num_ins, error_sum / num_ins))
    print("Final error rate [%s] (%d/%d) = %f" %
          (args.error_rate_type, num_ins, num_ins, error_sum / num_ins))


def main():
    utils.print_arguments(args)
    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
    evaluate()


if __name__ == '__main__':
    main()
@ -0,0 +1,28 @@
#! /usr/bin/bash

pushd ../..

CUDA_VISIBLE_DEVICES=0 \
python -u infer.py \
--num_samples=10 \
--trainer_count=1 \
--beam_size=500 \
--num_proc_bsearch=12 \
--num_proc_data=12 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--alpha=0.36 \
--beta=0.25 \
--cutoff_prob=0.99 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--infer_manifest='data/librispeech/manifest.dev-clean' \
--mean_std_path='data/librispeech/mean_std.npz' \
--vocab_path='data/librispeech/eng_vocab.txt' \
--model_path='checkpoints/params.latest.tar.gz' \
--lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \
--decoding_method='ctc_beam_search' \
--error_rate_type='wer' \
--specgram_type='linear'
@ -0,0 +1,32 @@
#! /usr/bin/bash

pushd ../..

# download data, generate manifests
python data/librispeech/librispeech.py \
--manifest_prefix='data/librispeech/manifest' \
--full_download='True' \
--target_dir='~/.cache/paddle/dataset/speech/Libri'

if [ $? -ne 0 ]; then
    echo "Prepare LibriSpeech failed. Terminated."
    exit 1
fi

#cat data/librispeech/manifest.train* | shuf > data/librispeech/manifest.train


# compute mean and stddev for normalizer
python tools/compute_mean_std.py \
--manifest_path='data/librispeech/manifest.train' \
--num_samples=2000 \
--specgram_type='linear' \
--output_path='data/librispeech/mean_std.npz'

if [ $? -ne 0 ]; then
    echo "Compute mean and stddev failed. Terminated."
    exit 1
fi


echo "LibriSpeech Data preparation done."
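Each manifest referenced above is a plain-text file with one JSON object per line. The field names in the sample line below are an assumption based on the repo's data utilities; they are not shown anywhere in this diff:

{"audio_filepath": "/path/to/dev-clean/1272/128104/1272-128104-0000.flac", "duration": 5.86, "text": "example transcription"}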
@ -0,0 +1,28 @@
#! /usr/bin/bash

pushd ../..

CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
python -u evaluate.py \
--batch_size=128 \
--trainer_count=8 \
--beam_size=500 \
--num_proc_bsearch=12 \
--num_proc_data=12 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--alpha=0.36 \
--beta=0.25 \
--cutoff_prob=0.99 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--test_manifest='data/librispeech/manifest.test-clean' \
--mean_std_path='data/librispeech/mean_std.npz' \
--vocab_path='data/librispeech/eng_vocab.txt' \
--model_path='checkpoints/params.latest.tar.gz' \
--lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \
--decoding_method='ctc_beam_search' \
--error_rate_type='wer' \
--specgram_type='linear'
@ -0,0 +1,30 @@
#! /usr/bin/bash

pushd ../..

CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
python -u train.py \
--batch_size=256 \
--trainer_count=8 \
--num_passes=200 \
--num_proc_data=12 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--num_iter_print=100 \
--learning_rate=5e-4 \
--max_duration=27.0 \
--min_duration=0.0 \
--use_sortagrad=True \
--use_gru=False \
--use_gpu=True \
--is_local=True \
--share_rnn_weights=True \
--train_manifest='data/librispeech/manifest.train' \
--dev_manifest='data/librispeech/manifest.dev' \
--mean_std_path='data/librispeech/mean_std.npz' \
--vocab_path='data/librispeech/eng_vocab.txt' \
--output_model_dir='./checkpoints' \
--augment_conf_path='conf/augmentation.config' \
--specgram_type='linear' \
--shuffle_method='batch_shuffle_clipped'
@ -0,0 +1,30 @@
#! /usr/bin/bash

pushd ../..

CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
python -u tools/tune.py \
--num_samples=100 \
--trainer_count=8 \
--beam_size=500 \
--num_proc_bsearch=12 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--num_alphas=14 \
--num_betas=20 \
--alpha_from=0.1 \
--alpha_to=0.36 \
--beta_from=0.05 \
--beta_to=1.0 \
--cutoff_prob=0.99 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--tune_manifest='data/librispeech/manifest.dev-clean' \
--mean_std_path='data/librispeech/mean_std.npz' \
--vocab_path='data/librispeech/eng_vocab.txt' \
--model_path='checkpoints/params.latest.tar.gz' \
--lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \
--error_rate_type='wer' \
--specgram_type='linear'
@ -1,177 +0,0 @@
"""Contains DeepSpeech2 layers."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle.v2 as paddle


def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
                  padding, act):
    """Convolution layer with batch normalization.

    :param input: Input layer.
    :type input: LayerOutput
    :param filter_size: The x dimension of a filter kernel. Or input a tuple for
                        two image dimensions.
    :type filter_size: int|tuple|list
    :param num_channels_in: Number of input channels.
    :type num_channels_in: int
    :param num_channels_out: Number of output channels.
    :type num_channels_out: int
    :param padding: The x dimension of the padding. Or input a tuple for two
                    image dimensions.
    :type padding: int|tuple|list
    :param act: Activation type.
    :type act: BaseActivation
    :return: Batch norm layer after convolution layer.
    :rtype: LayerOutput
    """
    conv_layer = paddle.layer.img_conv(
        input=input,
        filter_size=filter_size,
        num_channels=num_channels_in,
        num_filters=num_channels_out,
        stride=stride,
        padding=padding,
        act=paddle.activation.Linear(),
        bias_attr=False)
    return paddle.layer.batch_norm(input=conv_layer, act=act)


def bidirectional_simple_rnn_bn_layer(name, input, size, act):
    """Bidirectional simple rnn layer with sequence-wise batch normalization.
    The batch normalization is only performed on input-state weights.

    :param name: Name of the layer.
    :type name: string
    :param input: Input layer.
    :type input: LayerOutput
    :param size: Number of RNN cells.
    :type size: int
    :param act: Activation type.
    :type act: BaseActivation
    :return: Bidirectional simple rnn layer.
    :rtype: LayerOutput
    """
    # input-hidden weights shared across bi-directional rnn.
    input_proj = paddle.layer.fc(
        input=input, size=size, act=paddle.activation.Linear(), bias_attr=False)
    # batch norm is only performed on input-state projection
    input_proj_bn = paddle.layer.batch_norm(
        input=input_proj, act=paddle.activation.Linear())
    # forward and backward in time
    forward_simple_rnn = paddle.layer.recurrent(
        input=input_proj_bn, act=act, reverse=False)
    backward_simple_rnn = paddle.layer.recurrent(
        input=input_proj_bn, act=act, reverse=True)
    return paddle.layer.concat(input=[forward_simple_rnn, backward_simple_rnn])


def conv_group(input, num_stacks):
    """Convolution group with stacked convolution layers.

    :param input: Input layer.
    :type input: LayerOutput
    :param num_stacks: Number of stacked convolution layers.
    :type num_stacks: int
    :return: Output layer of the convolution group.
    :rtype: LayerOutput
    """
    conv = conv_bn_layer(
        input=input,
        filter_size=(11, 41),
        num_channels_in=1,
        num_channels_out=32,
        stride=(3, 2),
        padding=(5, 20),
        act=paddle.activation.BRelu())
    for i in xrange(num_stacks - 1):
        conv = conv_bn_layer(
            input=conv,
            filter_size=(11, 21),
            num_channels_in=32,
            num_channels_out=32,
            stride=(1, 2),
            padding=(5, 10),
            act=paddle.activation.BRelu())
    output_num_channels = 32
    output_height = 160 // pow(2, num_stacks) + 1
    return conv, output_num_channels, output_height


def rnn_group(input, size, num_stacks):
    """RNN group with stacked bidirectional simple RNN layers.

    :param input: Input layer.
    :type input: LayerOutput
    :param size: Number of RNN cells in each layer.
    :type size: int
    :param num_stacks: Number of stacked rnn layers.
    :type num_stacks: int
    :return: Output layer of the RNN group.
    :rtype: LayerOutput
    """
    output = input
    for i in xrange(num_stacks):
        output = bidirectional_simple_rnn_bn_layer(
            name=str(i), input=output, size=size, act=paddle.activation.BRelu())
    return output


def deep_speech2(audio_data,
                 text_data,
                 dict_size,
                 num_conv_layers=2,
                 num_rnn_layers=3,
                 rnn_size=256):
    """
    The whole DeepSpeech2 model structure (a simplified version).

    :param audio_data: Audio spectrogram data layer.
    :type audio_data: LayerOutput
    :param text_data: Transcription text data layer.
    :type text_data: LayerOutput
    :param dict_size: Dictionary size for tokenized transcription.
    :type dict_size: int
    :param num_conv_layers: Number of stacking convolution layers.
    :type num_conv_layers: int
    :param num_rnn_layers: Number of stacking RNN layers.
    :type num_rnn_layers: int
    :param rnn_size: RNN layer size (number of RNN cells).
    :type rnn_size: int
    :return: A tuple of an output unnormalized log probability layer (
             before softmax) and a ctc cost layer.
    :rtype: tuple of LayerOutput
    """
    # convolution group
    conv_group_output, conv_group_num_channels, conv_group_height = conv_group(
        input=audio_data, num_stacks=num_conv_layers)
    # convert data from convolution feature map to sequence of vectors
    conv2seq = paddle.layer.block_expand(
        input=conv_group_output,
        num_channels=conv_group_num_channels,
        stride_x=1,
        stride_y=1,
        block_x=1,
        block_y=conv_group_height)
    # rnn group
    rnn_group_output = rnn_group(
        input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers)
    fc = paddle.layer.fc(
        input=rnn_group_output,
        size=dict_size + 1,
        act=paddle.activation.Linear(),
        bias_attr=True)
    # probability distribution with softmax
    log_probs = paddle.layer.mixed(
        input=paddle.layer.identity_projection(input=fc),
        act=paddle.activation.Softmax())
    # ctc cost
    ctc_loss = paddle.layer.warp_ctc(
        input=fc,
        label=text_data,
        size=dict_size + 1,
        blank=dict_size,
        norm_by_times=True)
    return log_probs, ctc_loss
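A quick worked example of the conv_group height formula above, assuming the hard-coded 160 reflects a fixed 161-bin linear-spectrogram input (the first layer and each stacked layer stride the frequency axis by 2):

# With the default num_stacks = 2 used throughout this diff:
#   output_height = 160 // pow(2, 2) + 1 = 160 // 4 + 1 = 41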
@ -0,0 +1,274 @@
"""Contains DeepSpeech2 layers and networks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle.v2 as paddle


def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
                  padding, act):
    """Convolution layer with batch normalization.

    :param input: Input layer.
    :type input: LayerOutput
    :param filter_size: The x dimension of a filter kernel. Or input a tuple for
                        two image dimensions.
    :type filter_size: int|tuple|list
    :param num_channels_in: Number of input channels.
    :type num_channels_in: int
    :param num_channels_out: Number of output channels.
    :type num_channels_out: int
    :param padding: The x dimension of the padding. Or input a tuple for two
                    image dimensions.
    :type padding: int|tuple|list
    :param act: Activation type.
    :type act: BaseActivation
    :return: Batch norm layer after convolution layer.
    :rtype: LayerOutput
    """
    conv_layer = paddle.layer.img_conv(
        input=input,
        filter_size=filter_size,
        num_channels=num_channels_in,
        num_filters=num_channels_out,
        stride=stride,
        padding=padding,
        act=paddle.activation.Linear(),
        bias_attr=False)
    return paddle.layer.batch_norm(input=conv_layer, act=act)


def bidirectional_simple_rnn_bn_layer(name, input, size, act, share_weights):
    """Bidirectional simple rnn layer with sequence-wise batch normalization.
    The batch normalization is only performed on input-state weights.

    :param name: Name of the layer.
    :type name: string
    :param input: Input layer.
    :type input: LayerOutput
    :param size: Number of RNN cells.
    :type size: int
    :param act: Activation type.
    :type act: BaseActivation
    :param share_weights: Whether to share input-hidden weights between
                          forward and backward directional RNNs.
    :type share_weights: bool
    :return: Bidirectional simple rnn layer.
    :rtype: LayerOutput
    """
    if share_weights:
        # input-hidden weights shared between bi-directional rnn.
        input_proj = paddle.layer.fc(
            input=input,
            size=size,
            act=paddle.activation.Linear(),
            bias_attr=False)
        # batch norm is only performed on input-state projection
        input_proj_bn = paddle.layer.batch_norm(
            input=input_proj, act=paddle.activation.Linear())
        # forward and backward in time
        forward_simple_rnn = paddle.layer.recurrent(
            input=input_proj_bn, act=act, reverse=False)
        backward_simple_rnn = paddle.layer.recurrent(
            input=input_proj_bn, act=act, reverse=True)

    else:
        input_proj_forward = paddle.layer.fc(
            input=input,
            size=size,
            act=paddle.activation.Linear(),
            bias_attr=False)
        input_proj_backward = paddle.layer.fc(
            input=input,
            size=size,
            act=paddle.activation.Linear(),
            bias_attr=False)
        # batch norm is only performed on input-state projection
        input_proj_bn_forward = paddle.layer.batch_norm(
            input=input_proj_forward, act=paddle.activation.Linear())
        input_proj_bn_backward = paddle.layer.batch_norm(
            input=input_proj_backward, act=paddle.activation.Linear())
        # forward and backward in time
        forward_simple_rnn = paddle.layer.recurrent(
            input=input_proj_bn_forward, act=act, reverse=False)
        backward_simple_rnn = paddle.layer.recurrent(
            input=input_proj_bn_backward, act=act, reverse=True)

    return paddle.layer.concat(input=[forward_simple_rnn, backward_simple_rnn])


def bidirectional_gru_bn_layer(name, input, size, act):
    """Bidirectional gru layer with sequence-wise batch normalization.
    The batch normalization is only performed on input-state weights.

    :param name: Name of the layer.
    :type name: string
    :param input: Input layer.
    :type input: LayerOutput
    :param size: Number of RNN cells.
    :type size: int
    :param act: Activation type.
    :type act: BaseActivation
    :return: Bidirectional gru layer.
    :rtype: LayerOutput
    """
    input_proj_forward = paddle.layer.fc(
        input=input,
        size=size * 3,
        act=paddle.activation.Linear(),
        bias_attr=False)
    input_proj_backward = paddle.layer.fc(
        input=input,
        size=size * 3,
        act=paddle.activation.Linear(),
        bias_attr=False)
    # batch norm is only performed on input-related projections
    input_proj_bn_forward = paddle.layer.batch_norm(
        input=input_proj_forward, act=paddle.activation.Linear())
    input_proj_bn_backward = paddle.layer.batch_norm(
        input=input_proj_backward, act=paddle.activation.Linear())
    # forward and backward in time
    forward_gru = paddle.layer.grumemory(
        input=input_proj_bn_forward, act=act, reverse=False)
    backward_gru = paddle.layer.grumemory(
        input=input_proj_bn_backward, act=act, reverse=True)
    return paddle.layer.concat(input=[forward_gru, backward_gru])


def conv_group(input, num_stacks):
    """Convolution group with stacked convolution layers.

    :param input: Input layer.
    :type input: LayerOutput
    :param num_stacks: Number of stacked convolution layers.
    :type num_stacks: int
    :return: Output layer of the convolution group.
    :rtype: LayerOutput
    """
    conv = conv_bn_layer(
        input=input,
        filter_size=(11, 41),
        num_channels_in=1,
        num_channels_out=32,
        stride=(3, 2),
        padding=(5, 20),
        act=paddle.activation.BRelu())
    for i in xrange(num_stacks - 1):
        conv = conv_bn_layer(
            input=conv,
            filter_size=(11, 21),
            num_channels_in=32,
            num_channels_out=32,
            stride=(1, 2),
            padding=(5, 10),
            act=paddle.activation.BRelu())
    output_num_channels = 32
    output_height = 160 // pow(2, num_stacks) + 1
    return conv, output_num_channels, output_height


def rnn_group(input, size, num_stacks, use_gru, share_rnn_weights):
    """RNN group with stacked bidirectional simple RNN layers.

    :param input: Input layer.
    :type input: LayerOutput
    :param size: Number of RNN cells in each layer.
    :type size: int
    :param num_stacks: Number of stacked rnn layers.
    :type num_stacks: int
    :param use_gru: Use gru if set True. Use simple rnn if set False.
    :type use_gru: bool
    :param share_rnn_weights: Whether to share input-hidden weights between
                              forward and backward directional RNNs.
                              It is only available when use_gru=False.
    :type share_rnn_weights: bool
    :return: Output layer of the RNN group.
    :rtype: LayerOutput
    """
    output = input
    for i in xrange(num_stacks):
        if use_gru:
            output = bidirectional_gru_bn_layer(
                name=str(i),
                input=output,
                size=size,
                act=paddle.activation.Relu())
            # BRelu does not support hppl yet, so use Relu instead for now.
        else:
            output = bidirectional_simple_rnn_bn_layer(
                name=str(i),
                input=output,
                size=size,
                act=paddle.activation.BRelu(),
                share_weights=share_rnn_weights)
    return output


def deep_speech_v2_network(audio_data,
                           text_data,
                           dict_size,
                           num_conv_layers=2,
                           num_rnn_layers=3,
                           rnn_size=256,
                           use_gru=False,
                           share_rnn_weights=True):
    """The DeepSpeech2 network structure.

    :param audio_data: Audio spectrogram data layer.
    :type audio_data: LayerOutput
    :param text_data: Transcription text data layer.
    :type text_data: LayerOutput
    :param dict_size: Dictionary size for tokenized transcription.
    :type dict_size: int
    :param num_conv_layers: Number of stacking convolution layers.
    :type num_conv_layers: int
    :param num_rnn_layers: Number of stacking RNN layers.
    :type num_rnn_layers: int
    :param rnn_size: RNN layer size (number of RNN cells).
    :type rnn_size: int
    :param use_gru: Use gru if set True. Use simple rnn if set False.
    :type use_gru: bool
    :param share_rnn_weights: Whether to share input-hidden weights between
                              forward and backward direction RNNs.
                              It is only available when use_gru=False.
    :type share_rnn_weights: bool
    :return: A tuple of an output unnormalized log probability layer (
             before softmax) and a ctc cost layer.
    :rtype: tuple of LayerOutput
    """
    # convolution group
    conv_group_output, conv_group_num_channels, conv_group_height = conv_group(
        input=audio_data, num_stacks=num_conv_layers)
    # convert data from convolution feature map to sequence of vectors
    conv2seq = paddle.layer.block_expand(
        input=conv_group_output,
        num_channels=conv_group_num_channels,
        stride_x=1,
        stride_y=1,
        block_x=1,
        block_y=conv_group_height)
    # rnn group
    rnn_group_output = rnn_group(
        input=conv2seq,
        size=rnn_size,
        num_stacks=num_rnn_layers,
        use_gru=use_gru,
        share_rnn_weights=share_rnn_weights)
    fc = paddle.layer.fc(
        input=rnn_group_output,
        size=dict_size + 1,
        act=paddle.activation.Linear(),
        bias_attr=True)
    # probability distribution with softmax
    log_probs = paddle.layer.mixed(
        input=paddle.layer.identity_projection(input=fc),
        act=paddle.activation.Softmax())
    # ctc cost
    ctc_loss = paddle.layer.warp_ctc(
        input=fc,
        label=text_data,
        size=dict_size + 1,
        blank=dict_size,
        norm_by_times=True)
    return log_probs, ctc_loss
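The warp_ctc layer above reserves index dict_size as the CTC blank label. For illustration only, a minimal NumPy sketch of greedy (best-path) decoding under that convention; this is not the repo's decoder, just the standard collapse-repeats-then-drop-blanks rule:

import numpy as np

def ctc_greedy_decode(probs, vocab_list):
    """probs: (num_time_steps, len(vocab_list) + 1) softmax output, where
    the last column is the blank, matching blank=dict_size above."""
    blank = len(vocab_list)
    best_path = np.argmax(probs, axis=1)
    tokens, prev = [], -1
    for idx in best_path:
        if idx != prev and idx != blank:  # collapse repeats, drop blanks
            tokens.append(vocab_list[idx])
        prev = idx
    return ''.join(tokens)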
@ -0,0 +1,121 @@
"""Evaluation for DeepSpeech2 model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import functools
import paddle.v2 as paddle
from data_utils.data import DataGenerator
from models.model import DeepSpeech2Model
from utils.error_rate import wer, cer
from utils.utility import add_arguments, print_arguments

parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('batch_size',       int,    128,    "Minibatch size.")
add_arg('trainer_count',    int,    8,      "# of Trainers (CPUs or GPUs).")
add_arg('beam_size',        int,    500,    "Beam search width.")
add_arg('num_proc_bsearch', int,    12,     "# of CPUs for beam search.")
add_arg('num_proc_data',    int,    12,     "# of CPUs for data preprocessing.")
add_arg('num_conv_layers',  int,    2,      "# of convolution layers.")
add_arg('num_rnn_layers',   int,    3,      "# of recurrent layers.")
add_arg('rnn_layer_size',   int,    2048,   "# of recurrent cells per layer.")
add_arg('alpha',            float,  0.36,   "Coef of LM for beam search.")
add_arg('beta',             float,  0.25,   "Coef of WC for beam search.")
add_arg('cutoff_prob',      float,  0.99,   "Cutoff probability for pruning.")
add_arg('use_gru',          bool,   False,  "Use GRUs instead of simple RNNs.")
add_arg('use_gpu',          bool,   True,   "Use GPU or not.")
add_arg('share_rnn_weights',bool,   True,   "Share input-hidden weights across "
                                            "bi-directional RNNs. Not for GRU.")
add_arg('test_manifest',    str,
        'data/librispeech/manifest.test-clean',
        "Filepath of manifest to evaluate.")
add_arg('mean_std_path',    str,
        'data/librispeech/mean_std.npz',
        "Filepath of normalizer's mean & std.")
add_arg('vocab_path',       str,
        'data/librispeech/eng_vocab.txt',
        "Filepath of vocabulary.")
add_arg('model_path',       str,
        './checkpoints/params.latest.tar.gz',
        "If None, the training starts from scratch, "
        "otherwise, it resumes from the pre-trained model.")
add_arg('lang_model_path',  str,
        'lm/data/common_crawl_00.prune01111.trie.klm',
        "Filepath for language model.")
add_arg('decoding_method',  str,
        'ctc_beam_search',
        "Decoding method. Options: ctc_beam_search, ctc_greedy",
        choices=['ctc_beam_search', 'ctc_greedy'])
add_arg('error_rate_type',  str,
        'wer',
        "Error rate type for evaluation.",
        choices=['wer', 'cer'])
add_arg('specgram_type',    str,
        'linear',
        "Audio feature type. Options: linear, mfcc.",
        choices=['linear', 'mfcc'])
# yapf: enable
args = parser.parse_args()


def evaluate():
    """Evaluate on whole test data for DeepSpeech2."""
    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        num_threads=args.num_proc_data)
    batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.test_manifest,
        batch_size=args.batch_size,
        min_batch_size=1,
        sortagrad=False,
        shuffle_method=None)

    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        pretrained_model_path=args.model_path,
        share_rnn_weights=args.share_rnn_weights)

    error_rate_func = cer if args.error_rate_type == 'cer' else wer
    error_sum, num_ins = 0.0, 0
    for infer_data in batch_reader():
        result_transcripts = ds2_model.infer_batch(
            infer_data=infer_data,
            decoding_method=args.decoding_method,
            beam_alpha=args.alpha,
            beam_beta=args.beta,
            beam_size=args.beam_size,
            cutoff_prob=args.cutoff_prob,
            vocab_list=data_generator.vocab_list,
            language_model_path=args.lang_model_path,
            num_processes=args.num_proc_bsearch)
        target_transcripts = [
            ''.join([data_generator.vocab_list[token] for token in transcript])
            for _, transcript in infer_data
        ]
        for target, result in zip(target_transcripts, result_transcripts):
            error_sum += error_rate_func(target, result)
            num_ins += 1
        print("Error rate [%s] (%d/?) = %f" %
              (args.error_rate_type, num_ins, error_sum / num_ins))
    print("Final error rate [%s] (%d/%d) = %f" %
          (args.error_rate_type, num_ins, num_ins, error_sum / num_ins))


def main():
    print_arguments(args)
    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
    evaluate()


if __name__ == '__main__':
    main()
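The wer and cer functions imported above follow the usual definition: word- (or character-) level edit distance divided by reference length. A self-contained sketch under that assumption (utils.error_rate may differ in details such as tokenization):

def wer_sketch(reference, hypothesis):
    """Word error rate: word-level Levenshtein distance / # reference words."""
    ref, hyp = reference.split(), hypothesis.split()
    # dp[i][j] = edit distance between ref[:i] and hyp[:j]
    dp = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        dp[i][0] = i
    for j in range(len(hyp) + 1):
        dp[0][j] = j
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1,          # deletion
                           dp[i][j - 1] + 1,          # insertion
                           dp[i - 1][j - 1] + cost)   # substitution
    return float(dp[len(ref)][len(hyp)]) / len(ref)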
@ -1,23 +0,0 @@
"""Test Setup."""
import unittest
import numpy as np
import os


class TestSetup(unittest.TestCase):
    def test_soundfile(self):
        import soundfile as sf
        # floating point data is typically limited to the interval [-1.0, 1.0],
        # but smaller/larger values are supported as well
        data = np.array([[1.75, -1.75], [1.0, -1.0], [0.5, -0.5],
                         [0.25, -0.25]])
        file = 'test.wav'
        sf.write(file, data, 44100, format='WAV', subtype='FLOAT')
        read, fs = sf.read(file)
        self.assertTrue(np.all(read == data))
        self.assertEqual(fs, 44100)
        os.remove(file)


if __name__ == '__main__':
    unittest.main()
@ -0,0 +1,131 @@
"""Beam search parameters tuning for DeepSpeech2 model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import argparse
import functools
import paddle.v2 as paddle
import _init_paths
from data_utils.data import DataGenerator
from models.model import DeepSpeech2Model
from utils.error_rate import wer
from utils.utility import add_arguments, print_arguments

parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('num_samples',      int,    100,    "# of samples to infer.")
add_arg('trainer_count',    int,    8,      "# of Trainers (CPUs or GPUs).")
add_arg('beam_size',        int,    500,    "Beam search width.")
add_arg('num_proc_bsearch', int,    12,     "# of CPUs for beam search.")
add_arg('num_conv_layers',  int,    2,      "# of convolution layers.")
add_arg('num_rnn_layers',   int,    3,      "# of recurrent layers.")
add_arg('rnn_layer_size',   int,    2048,   "# of recurrent cells per layer.")
add_arg('num_alphas',       int,    14,     "# of alpha candidates for tuning.")
add_arg('num_betas',        int,    20,     "# of beta candidates for tuning.")
add_arg('alpha_from',       float,  0.1,    "Where alpha starts tuning from.")
add_arg('alpha_to',         float,  0.36,   "Where alpha ends tuning with.")
add_arg('beta_from',        float,  0.05,   "Where beta starts tuning from.")
add_arg('beta_to',          float,  1.0,    "Where beta ends tuning with.")
add_arg('cutoff_prob',      float,  0.99,   "Cutoff probability for pruning.")
add_arg('use_gru',          bool,   False,  "Use GRUs instead of simple RNNs.")
add_arg('use_gpu',          bool,   True,   "Use GPU or not.")
add_arg('share_rnn_weights',bool,   True,   "Share input-hidden weights across "
                                            "bi-directional RNNs. Not for GRU.")
add_arg('tune_manifest',    str,
        'data/librispeech/manifest.dev',
        "Filepath of manifest to tune.")
add_arg('mean_std_path',    str,
        'data/librispeech/mean_std.npz',
        "Filepath of normalizer's mean & std.")
add_arg('vocab_path',       str,
        'data/librispeech/eng_vocab.txt',
        "Filepath of vocabulary.")
add_arg('lang_model_path',  str,
        'lm/data/common_crawl_00.prune01111.trie.klm',
        "Filepath for language model.")
add_arg('model_path',       str,
        './checkpoints/params.latest.tar.gz',
        "If None, the training starts from scratch, "
        "otherwise, it resumes from the pre-trained model.")
add_arg('error_rate_type',  str,
        'wer',
        "Error rate type for evaluation.",
        choices=['wer', 'cer'])
add_arg('specgram_type',    str,
        'linear',
        "Audio feature type. Options: linear, mfcc.",
        choices=['linear', 'mfcc'])
# yapf: enable
args = parser.parse_args()


def tune():
    """Tune parameters alpha and beta on one minibatch."""
    if not args.num_alphas >= 0:
        raise ValueError("num_alphas must be non-negative!")
    if not args.num_betas >= 0:
        raise ValueError("num_betas must be non-negative!")

    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        num_threads=1)
    batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.tune_manifest,
        batch_size=args.num_samples,
        sortagrad=False,
        shuffle_method=None)
    tune_data = batch_reader().next()
    target_transcripts = [
        ''.join([data_generator.vocab_list[token] for token in transcript])
        for _, transcript in tune_data
    ]

    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        pretrained_model_path=args.model_path,
        share_rnn_weights=args.share_rnn_weights)

    # create grid for search
    cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas)
    cand_betas = np.linspace(args.beta_from, args.beta_to, args.num_betas)
    params_grid = [(alpha, beta) for alpha in cand_alphas
                   for beta in cand_betas]

    ## tune parameters in loop
    for alpha, beta in params_grid:
        result_transcripts = ds2_model.infer_batch(
            infer_data=tune_data,
            decoding_method='ctc_beam_search',
            beam_alpha=alpha,
            beam_beta=beta,
            beam_size=args.beam_size,
            cutoff_prob=args.cutoff_prob,
            vocab_list=data_generator.vocab_list,
            language_model_path=args.lang_model_path,
            num_processes=args.num_proc_bsearch)
        wer_sum, num_ins = 0.0, 0
        for target, result in zip(target_transcripts, result_transcripts):
            wer_sum += wer(target, result)
            num_ins += 1
        print("alpha = %f\tbeta = %f\tWER = %f" %
              (alpha, beta, wer_sum / num_ins))


def main():
    print_arguments(args)
    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
    tune()


if __name__ == '__main__':
    main()
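With the run_tune.sh defaults above (num_alphas=14, num_betas=20), the grid contains 280 (alpha, beta) pairs, and each pair costs one full beam-search pass over the num_samples utterances:

import numpy as np
cand_alphas = np.linspace(0.1, 0.36, 14)  # alpha_from, alpha_to, num_alphas
cand_betas = np.linspace(0.05, 1.0, 20)   # beta_from, beta_to, num_betas
params_grid = [(a, b) for a in cand_alphas for b in cand_betas]
assert len(params_grid) == 14 * 20        # 280 decoding passes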
@ -1,196 +0,0 @@
"""Parameters tuning for DeepSpeech2 model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import distutils.util
import argparse
import multiprocessing
import paddle.v2 as paddle
from data_utils.data import DataGenerator
from model import DeepSpeech2Model
from error_rate import wer
import utils

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--num_samples",
    default=100,
    type=int,
    help="Number of samples for parameters tuning. (default: %(default)s)")
parser.add_argument(
    "--num_conv_layers",
    default=2,
    type=int,
    help="Convolution layer number. (default: %(default)s)")
parser.add_argument(
    "--num_rnn_layers",
    default=3,
    type=int,
    help="RNN layer number. (default: %(default)s)")
parser.add_argument(
    "--rnn_layer_size",
    default=512,
    type=int,
    help="RNN layer cell number. (default: %(default)s)")
parser.add_argument(
    "--use_gpu",
    default=True,
    type=distutils.util.strtobool,
    help="Use gpu or not. (default: %(default)s)")
parser.add_argument(
    "--trainer_count",
    default=8,
    type=int,
    help="Trainer number. (default: %(default)s)")
parser.add_argument(
    "--num_threads_data",
    default=1,
    type=int,
    help="Number of cpu threads for preprocessing data. (default: %(default)s)")
parser.add_argument(
    "--num_processes_beam_search",
    default=multiprocessing.cpu_count() // 2,
    type=int,
    help="Number of cpu processes for beam search. (default: %(default)s)")
parser.add_argument(
    "--specgram_type",
    default='linear',
    type=str,
    help="Feature type of audio data: 'linear' (power spectrum)"
    " or 'mfcc'. (default: %(default)s)")
parser.add_argument(
    "--mean_std_filepath",
    default='mean_std.npz',
    type=str,
    help="Manifest path for normalizer. (default: %(default)s)")
parser.add_argument(
    "--tune_manifest_path",
    default='datasets/manifest.dev',
    type=str,
    help="Manifest path for tuning. (default: %(default)s)")
parser.add_argument(
    "--model_filepath",
    default='checkpoints/params.latest.tar.gz',
    type=str,
    help="Model filepath. (default: %(default)s)")
parser.add_argument(
    "--vocab_filepath",
    default='datasets/vocab/eng_vocab.txt',
    type=str,
    help="Vocabulary filepath. (default: %(default)s)")
parser.add_argument(
    "--beam_size",
    default=500,
    type=int,
    help="Width for beam search decoding. (default: %(default)d)")
parser.add_argument(
    "--language_model_path",
    default="lm/data/common_crawl_00.prune01111.trie.klm",
    type=str,
    help="Path for language model. (default: %(default)s)")
parser.add_argument(
    "--alpha_from",
    default=0.1,
    type=float,
    help="Where alpha starts from. (default: %(default)f)")
parser.add_argument(
    "--num_alphas",
    default=14,
    type=int,
    help="Number of candidate alphas. (default: %(default)d)")
parser.add_argument(
    "--alpha_to",
    default=0.36,
    type=float,
    help="Where alpha ends with. (default: %(default)f)")
parser.add_argument(
    "--beta_from",
    default=0.05,
    type=float,
    help="Where beta starts from. (default: %(default)f)")
parser.add_argument(
    "--num_betas",
    default=20,
    type=float,
    help="Number of candidate betas. (default: %(default)d)")
parser.add_argument(
    "--beta_to",
    default=1.0,
    type=float,
    help="Where beta ends with. (default: %(default)f)")
parser.add_argument(
    "--cutoff_prob",
    default=0.99,
    type=float,
    help="The cutoff probability of pruning "
    "in beam search. (default: %(default)f)")
args = parser.parse_args()


def tune():
    """Tune parameters alpha and beta on one minibatch."""
    if not args.num_alphas >= 0:
        raise ValueError("num_alphas must be non-negative!")
    if not args.num_betas >= 0:
        raise ValueError("num_betas must be non-negative!")

    data_generator = DataGenerator(
        vocab_filepath=args.vocab_filepath,
        mean_std_filepath=args.mean_std_filepath,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        num_threads=args.num_threads_data)
    batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.tune_manifest_path,
        batch_size=args.num_samples,
        sortagrad=False,
        shuffle_method=None)
    tune_data = batch_reader().next()
    target_transcripts = [
        ''.join([data_generator.vocab_list[token] for token in transcript])
        for _, transcript in tune_data
    ]

    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        pretrained_model_path=args.model_filepath)

    # create grid for search
    cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas)
    cand_betas = np.linspace(args.beta_from, args.beta_to, args.num_betas)
    params_grid = [(alpha, beta) for alpha in cand_alphas
                   for beta in cand_betas]

    ## tune parameters in loop
    for alpha, beta in params_grid:
        result_transcripts = ds2_model.infer_batch(
            infer_data=tune_data,
            decode_method='beam_search',
            beam_alpha=alpha,
            beam_beta=beta,
            beam_size=args.beam_size,
            cutoff_prob=args.cutoff_prob,
            vocab_list=data_generator.vocab_list,
            language_model_path=args.language_model_path,
            num_processes=args.num_processes_beam_search)
        wer_sum, num_ins = 0.0, 0
        for target, result in zip(target_transcripts, result_transcripts):
            wer_sum += wer(target, result)
            num_ins += 1
        print("alpha = %f\tbeta = %f\tWER = %f" %
              (alpha, beta, wer_sum / num_ins))


def main():
    utils.print_arguments(args)
    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
    tune()


if __name__ == '__main__':
    main()
@ -1,25 +0,0 @@
"""Contains common utility functions."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function


def print_arguments(args):
    """Print argparse's arguments.

    Usage:

    .. code-block:: python

        parser = argparse.ArgumentParser()
        parser.add_argument("name", default="John", type=str, help="User name.")
        args = parser.parse_args()
        print_arguments(args)

    :param args: Input argparse.Namespace for printing.
    :type args: argparse.Namespace
    """
    print("----- Configuration Arguments -----")
    for arg, value in vars(args).iteritems():
        print("%s: %s" % (arg, value))
    print("------------------------------------")
@ -0,0 +1,47 @@
"""Contains common utility functions."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import distutils.util


def print_arguments(args):
    """Print argparse's arguments.

    Usage:

    .. code-block:: python

        parser = argparse.ArgumentParser()
        parser.add_argument("name", default="John", type=str, help="User name.")
        args = parser.parse_args()
        print_arguments(args)

    :param args: Input argparse.Namespace for printing.
    :type args: argparse.Namespace
    """
    print("----------- Configuration Arguments -----------")
    for arg, value in sorted(vars(args).iteritems()):
        print("%s: %s" % (arg, value))
    print("------------------------------------------------")


def add_arguments(argname, type, default, help, argparser, **kwargs):
    """Add argparse's argument.

    Usage:

    .. code-block:: python

        parser = argparse.ArgumentParser()
        add_arguments("name", str, "John", "User name.", parser)
        args = parser.parse_args()
    """
    type = distutils.util.strtobool if type == bool else type
    argparser.add_argument(
        "--" + argname,
        default=default,
        type=type,
        help=help + ' Default: %(default)s.',
        **kwargs)
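A minimal sketch of how the two helpers combine, mirroring the evaluate and tune scripts earlier in this diff:

import argparse
import functools
from utils.utility import add_arguments, print_arguments

parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
add_arg('use_gpu', bool, True, "Use GPU or not.")
# bool args go through distutils.util.strtobool, so --use_gpu=False parses.
args = parser.parse_args()
print_arguments(args)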