Merge branch 'develop' of https://github.com/PaddlePaddle/models into ctc_decoder_deploy
commit
11ede80a48
@ -1,13 +0,0 @@
|
|||||||
# Download LibriSpeech, then merge the per-subset manifests into the
# train/dev/test manifests written to the current directory.

# Abort if the dataset directory cannot be entered; otherwise the download
# script below would be looked up in the wrong working directory.
cd librispeech || exit 1
if ! python librispeech.py; then
    echo "Prepare LibriSpeech failed. Terminated."
    exit 1
fi
cd - || exit 1

# All training subsets are concatenated and shuffled into one manifest.
cat librispeech/manifest.train* | shuf > manifest.train
cat librispeech/manifest.dev-clean > manifest.dev
cat librispeech/manifest.test-clean > manifest.test

echo "All done."
|
|
@ -1,10 +0,0 @@
|
|||||||
# Prepare the CHiME3 background-noise data and merge its manifests into
# a single manifest.noise in the current directory.

# Abort if the noise directory cannot be entered; otherwise the preparation
# script below would be looked up in the wrong working directory.
cd noise || exit 1
if ! python chime3_background.py; then
    echo "Prepare CHiME3 background noise failed. Terminated."
    exit 1
fi
cd - || exit 1

cat noise/manifest.* > manifest.noise
echo "All done."
|
|
@ -0,0 +1,19 @@
|
|||||||
|
"""Set up paths for DS2"""
|
||||||
|
from __future__ import absolute_import
|
||||||
|
from __future__ import division
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
|
import os.path
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
|
def add_path(path):
    """Put *path* at the front of ``sys.path`` unless it is already listed."""
    if path in sys.path:
        # Already importable from here; keep the existing position.
        return
    sys.path.insert(0, path)
|
||||||
|
|
||||||
|
|
||||||
|
# Resolve this file's directory to an absolute path: a relative __file__
# would make the sys.path entry depend on the working directory at import
# time and break if the process changes directory afterwards.
this_dir = os.path.dirname(os.path.abspath(__file__))

# Add project path to PYTHONPATH
proj_path = os.path.join(this_dir, '..')
add_path(proj_path)
|
@ -1,180 +0,0 @@
|
|||||||
"""Evaluation for DeepSpeech2 model."""
|
|
||||||
from __future__ import absolute_import
|
|
||||||
from __future__ import division
|
|
||||||
from __future__ import print_function
|
|
||||||
|
|
||||||
import distutils.util
|
|
||||||
import argparse
|
|
||||||
import multiprocessing
|
|
||||||
import paddle.v2 as paddle
|
|
||||||
from data_utils.data import DataGenerator
|
|
||||||
from model import DeepSpeech2Model
|
|
||||||
from error_rate import wer, cer
|
|
||||||
import utils
|
|
||||||
|
|
||||||
# Command-line options of the evaluation script. They are parsed at import
# time into the module-level `args` that evaluate() and main() read.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--batch_size",
    default=128,
    type=int,
    help="Minibatch size for evaluation. (default: %(default)s)")
parser.add_argument(
    "--trainer_count",
    default=8,
    type=int,
    help="Trainer number. (default: %(default)s)")
parser.add_argument(
    "--num_conv_layers",
    default=2,
    type=int,
    help="Convolution layer number. (default: %(default)s)")
parser.add_argument(
    "--num_rnn_layers",
    default=3,
    type=int,
    help="RNN layer number. (default: %(default)s)")
parser.add_argument(
    "--rnn_layer_size",
    default=512,
    type=int,
    help="RNN layer cell number. (default: %(default)s)")
parser.add_argument(
    "--use_gpu",
    default=True,
    type=distutils.util.strtobool,
    help="Use gpu or not. (default: %(default)s)")
parser.add_argument(
    "--num_threads_data",
    default=multiprocessing.cpu_count() // 2,
    type=int,
    help="Number of cpu threads for preprocessing data. (default: %(default)s)")
parser.add_argument(
    "--num_processes_beam_search",
    default=multiprocessing.cpu_count() // 2,
    type=int,
    help="Number of cpu processes for beam search. (default: %(default)s)")
# The help text used to say "Manifest path", but the value is handed to
# DataGenerator as its mean_std_filepath.
parser.add_argument(
    "--mean_std_filepath",
    default='mean_std.npz',
    type=str,
    help="Mean/std filepath for normalizer. (default: %(default)s)")
parser.add_argument(
    "--decode_method",
    default='beam_search',
    type=str,
    help="Method for ctc decoding, best_path or beam_search. "
    "(default: %(default)s)")
parser.add_argument(
    "--language_model_path",
    default="lm/data/common_crawl_00.prune01111.trie.klm",
    type=str,
    help="Path for language model. (default: %(default)s)")
parser.add_argument(
    "--alpha",
    default=0.36,
    type=float,
    help="Parameter associated with language model. (default: %(default)f)")
parser.add_argument(
    "--beta",
    default=0.25,
    type=float,
    help="Parameter associated with word count. (default: %(default)f)")
# The trailing space keeps the two adjacent literals from fusing into
# "pruningin beam search" in the rendered help.
parser.add_argument(
    "--cutoff_prob",
    default=0.99,
    type=float,
    help="The cutoff probability of pruning "
    "in beam search. (default: %(default)f)")
parser.add_argument(
    "--beam_size",
    default=500,
    type=int,
    help="Width for beam search decoding. (default: %(default)d)")
parser.add_argument(
    "--specgram_type",
    default='linear',
    type=str,
    help="Feature type of audio data: 'linear' (power spectrum)"
    " or 'mfcc'. (default: %(default)s)")
parser.add_argument(
    "--decode_manifest_path",
    default='datasets/manifest.test',
    type=str,
    help="Manifest path for decoding. (default: %(default)s)")
parser.add_argument(
    "--model_filepath",
    default='checkpoints/params.latest.tar.gz',
    type=str,
    help="Model filepath. (default: %(default)s)")
parser.add_argument(
    "--vocab_filepath",
    default='datasets/vocab/eng_vocab.txt',
    type=str,
    help="Vocabulary filepath. (default: %(default)s)")
parser.add_argument(
    "--error_rate_type",
    default='wer',
    choices=['wer', 'cer'],
    type=str,
    help="Error rate type for evaluation. 'wer' for word error rate and 'cer' "
    "for character error rate. "
    "(default: %(default)s)")
args = parser.parse_args()
|
|
||||||
|
|
||||||
|
|
||||||
def evaluate():
    """Evaluate on whole test data for DeepSpeech2.

    Reads the manifest named by ``args.decode_manifest_path`` batch by batch,
    decodes each batch with the model loaded from ``args.model_filepath`` and
    accumulates the error rate selected by ``args.error_rate_type``. The
    running average is printed after every batch and the final average once
    all batches are processed. If the reader yields no utterances, a message
    is printed instead of the final average.
    """
    data_generator = DataGenerator(
        vocab_filepath=args.vocab_filepath,
        mean_std_filepath=args.mean_std_filepath,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        num_threads=args.num_threads_data)
    batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.decode_manifest_path,
        batch_size=args.batch_size,
        min_batch_size=1,
        sortagrad=False,
        shuffle_method=None)

    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        pretrained_model_path=args.model_filepath)

    error_rate_func = cer if args.error_rate_type == 'cer' else wer
    error_sum, num_ins = 0.0, 0
    for infer_data in batch_reader():
        result_transcripts = ds2_model.infer_batch(
            infer_data=infer_data,
            decode_method=args.decode_method,
            beam_alpha=args.alpha,
            beam_beta=args.beta,
            beam_size=args.beam_size,
            cutoff_prob=args.cutoff_prob,
            vocab_list=data_generator.vocab_list,
            language_model_path=args.language_model_path,
            num_processes=args.num_processes_beam_search)
        # Rebuild the reference text of each utterance from its token ids.
        target_transcripts = [
            ''.join([data_generator.vocab_list[token] for token in transcript])
            for _, transcript in infer_data
        ]
        for target, result in zip(target_transcripts, result_transcripts):
            error_sum += error_rate_func(target, result)
            num_ins += 1
        print("Error rate [%s] (%d/?) = %f" %
              (args.error_rate_type, num_ins, error_sum / num_ins))
    if num_ins == 0:
        # An empty manifest would otherwise end in a ZeroDivisionError below.
        print("No utterances found in %s. Nothing to evaluate." %
              args.decode_manifest_path)
        return
    print("Final error rate [%s] (%d/%d) = %f" %
          (args.error_rate_type, num_ins, num_ins, error_sum / num_ins))
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Print the parsed arguments, initialize paddle and run the evaluation."""
    utils.print_arguments(args)
    paddle.init(trainer_count=args.trainer_count, use_gpu=args.use_gpu)
    evaluate()


# Only run the evaluation when executed as a script, not on import.
if __name__ == '__main__':
    main()
|
|
@ -0,0 +1,28 @@
|
|||||||
|
#!/usr/bin/env bash

# Run inference on a few samples of the LibriSpeech dev-clean manifest using
# CTC beam-search decoding. All paths below are relative to the directory
# two levels above the one this script is launched from.

# Stop if that directory cannot be entered, so infer.py is never looked up
# relative to the wrong place.
pushd ../.. || exit 1

CUDA_VISIBLE_DEVICES=0 \
python -u infer.py \
--num_samples=10 \
--trainer_count=1 \
--beam_size=500 \
--num_proc_bsearch=12 \
--num_proc_data=12 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--alpha=0.36 \
--beta=0.25 \
--cutoff_prob=0.99 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--infer_manifest='data/librispeech/manifest.dev-clean' \
--mean_std_path='data/librispeech/mean_std.npz' \
--vocab_path='data/librispeech/eng_vocab.txt' \
--model_path='checkpoints/params.latest.tar.gz' \
--lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \
--decoding_method='ctc_beam_search' \
--error_rate_type='wer' \
--specgram_type='linear'
|
@ -0,0 +1,32 @@
|
|||||||
|
#!/usr/bin/env bash

# Prepare the LibriSpeech data: download it, generate the manifests and
# compute the mean/stddev file used by the feature normalizer. All paths
# below are relative to the directory two levels above the one this script
# is launched from.

# Stop if that directory cannot be entered, so the python scripts are never
# looked up relative to the wrong place.
pushd ../.. || exit 1

# download data, generate manifests
# NOTE(review): the single quotes keep the shell from expanding '~' in
# --target_dir; presumably librispeech.py expands it itself -- confirm.
python data/librispeech/librispeech.py \
--manifest_prefix='data/librispeech/manifest' \
--full_download='True' \
--target_dir='~/.cache/paddle/dataset/speech/Libri'

if [ $? -ne 0 ]; then
    echo "Prepare LibriSpeech failed. Terminated."
    exit 1
fi

# NOTE(review): this merge step is disabled, yet compute_mean_std.py below
# reads data/librispeech/manifest.train -- confirm that file is produced
# elsewhere.
#cat data/librispeech/manifest.train* | shuf > data/librispeech/manifest.train


# compute mean and stddev for normalizer
python tools/compute_mean_std.py \
--manifest_path='data/librispeech/manifest.train' \
--num_samples=2000 \
--specgram_type='linear' \
--output_path='data/librispeech/mean_std.npz'

if [ $? -ne 0 ]; then
    echo "Compute mean and stddev failed. Terminated."
    exit 1
fi


echo "LibriSpeech Data preparation done."
|
@ -0,0 +1,28 @@
|
|||||||
|
#!/usr/bin/env bash

# Evaluate the latest checkpoint on the LibriSpeech test-clean manifest with
# CTC beam-search decoding on eight GPUs. All paths below are relative to
# the directory two levels above the one this script is launched from.

# Stop if that directory cannot be entered, so evaluate.py is never looked
# up relative to the wrong place.
pushd ../.. || exit 1

CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
python -u evaluate.py \
--batch_size=128 \
--trainer_count=8 \
--beam_size=500 \
--num_proc_bsearch=12 \
--num_proc_data=12 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--alpha=0.36 \
--beta=0.25 \
--cutoff_prob=0.99 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--test_manifest='data/librispeech/manifest.test-clean' \
--mean_std_path='data/librispeech/mean_std.npz' \
--vocab_path='data/librispeech/eng_vocab.txt' \
--model_path='checkpoints/params.latest.tar.gz' \
--lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \
--decoding_method='ctc_beam_search' \
--error_rate_type='wer' \
--specgram_type='linear'
|
@ -0,0 +1,30 @@
|
|||||||
|
#!/usr/bin/env bash

# Train the DeepSpeech2 model on the LibriSpeech manifests using eight GPUs.
# All paths below are relative to the directory two levels above the one
# this script is launched from.

# Stop if that directory cannot be entered, so train.py is never looked up
# relative to the wrong place.
pushd ../.. || exit 1

CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
python -u train.py \
--batch_size=256 \
--trainer_count=8 \
--num_passes=200 \
--num_proc_data=12 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--num_iter_print=100 \
--learning_rate=5e-4 \
--max_duration=27.0 \
--min_duration=0.0 \
--use_sortagrad=True \
--use_gru=False \
--use_gpu=True \
--is_local=True \
--share_rnn_weights=True \
--train_manifest='data/librispeech/manifest.train' \
--dev_manifest='data/librispeech/manifest.dev' \
--mean_std_path='data/librispeech/mean_std.npz' \
--vocab_path='data/librispeech/eng_vocab.txt' \
--output_model_dir='./checkpoints' \
--augment_conf_path='conf/augmentation.config' \
--specgram_type='linear' \
--shuffle_method='batch_shuffle_clipped'
|
@ -0,0 +1,30 @@
|
|||||||
|
#!/usr/bin/env bash

# Tune the beam-search alpha/beta parameters over a grid, using samples of
# the LibriSpeech dev-clean manifest. All paths below are relative to the
# directory two levels above the one this script is launched from.

# Stop if that directory cannot be entered, so tools/tune.py is never looked
# up relative to the wrong place.
pushd ../.. || exit 1

CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
python -u tools/tune.py \
--num_samples=100 \
--trainer_count=8 \
--beam_size=500 \
--num_proc_bsearch=12 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--num_alphas=14 \
--num_betas=20 \
--alpha_from=0.1 \
--alpha_to=0.36 \
--beta_from=0.05 \
--beta_to=1.0 \
--cutoff_prob=0.99 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--tune_manifest='data/librispeech/manifest.dev-clean' \
--mean_std_path='data/librispeech/mean_std.npz' \
--vocab_path='data/librispeech/eng_vocab.txt' \
--model_path='checkpoints/params.latest.tar.gz' \
--lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \
--error_rate_type='wer' \
--specgram_type='linear'
|
@ -1,177 +0,0 @@
|
|||||||
"""Contains DeepSpeech2 layers."""
|
|
||||||
from __future__ import absolute_import
|
|
||||||
from __future__ import division
|
|
||||||
from __future__ import print_function
|
|
||||||
|
|
||||||
import paddle.v2 as paddle
|
|
||||||
|
|
||||||
|
|
||||||
def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
                  padding, act):
    """Build a convolution layer followed by batch normalization.

    :param input: Input layer.
    :type input: LayerOutput
    :param filter_size: The x dimension of a filter kernel. Or input a tuple for
                        two image dimension.
    :type filter_size: int|tuple|list
    :param num_channels_in: Number of input channels.
    :type num_channels_in: int
    :param num_channels_out: Number of output channels.
    :type num_channels_out: int
    :param stride: Stride forwarded to the convolution layer.
    :param padding: The x dimension of the padding. Or input a tuple for two
                    image dimension.
    :type padding: int|tuple|list
    :param act: Activation type, applied by the batch norm layer.
    :type act: BaseActivation
    :return: Batch norm layer after convolution layer.
    :rtype: LayerOutput
    """
    # The convolution itself is linear and bias-free; the requested
    # activation is applied by the batch norm layer stacked on top of it.
    linear_conv = paddle.layer.img_conv(
        input=input,
        num_channels=num_channels_in,
        num_filters=num_channels_out,
        filter_size=filter_size,
        stride=stride,
        padding=padding,
        act=paddle.activation.Linear(),
        bias_attr=False)
    normalized = paddle.layer.batch_norm(input=linear_conv, act=act)
    return normalized
|
|
||||||
|
|
||||||
|
|
||||||
def bidirectional_simple_rnn_bn_layer(name, input, size, act):
    """Build a bidirectional simple RNN on a batch-normalized projection.

    Batch normalization is applied to the input projection only; the same
    normalized projection feeds both directions.

    :param name: Name of the layer (accepted but not used in this function).
    :type name: string
    :param input: Input layer.
    :type input: LayerOutput
    :param size: Number of RNN cells.
    :type size: int
    :param act: Activation type of the recurrent layers.
    :type act: BaseActivation
    :return: Concatenation of the forward and backward recurrent layers.
    :rtype: LayerOutput
    """
    # One linear, bias-free projection shared by both directions.
    shared_proj = paddle.layer.fc(
        input=input, size=size, act=paddle.activation.Linear(), bias_attr=False)
    shared_proj_bn = paddle.layer.batch_norm(
        input=shared_proj, act=paddle.activation.Linear())
    # Forward in time first, then backward, as in the concat order below.
    forward_rnn = paddle.layer.recurrent(
        input=shared_proj_bn, act=act, reverse=False)
    backward_rnn = paddle.layer.recurrent(
        input=shared_proj_bn, act=act, reverse=True)
    both_directions = [forward_rnn, backward_rnn]
    return paddle.layer.concat(input=both_directions)
|
|
||||||
|
|
||||||
|
|
||||||
def conv_group(input, num_stacks):
    """Convolution group with stacked convolution layers.

    The first layer uses an (11, 41) filter with stride (3, 2); each of the
    remaining ``num_stacks - 1`` layers uses an (11, 21) filter with stride
    (1, 2). All layers have 32 output channels and BRelu activation.

    :param input: Input layer.
    :type input: LayerOutput
    :param num_stacks: Number of stacked convolution layers.
    :type num_stacks: int
    :return: A tuple of the output layer of the convolution group, its number
             of channels and its height.
    :rtype: tuple
    """
    conv = conv_bn_layer(
        input=input,
        filter_size=(11, 41),
        num_channels_in=1,
        num_channels_out=32,
        stride=(3, 2),
        padding=(5, 20),
        act=paddle.activation.BRelu())
    # `range` exists on both Python 2 and 3, unlike `xrange`.
    for _ in range(num_stacks - 1):
        conv = conv_bn_layer(
            input=conv,
            filter_size=(11, 21),
            num_channels_in=32,
            num_channels_out=32,
            stride=(1, 2),
            padding=(5, 10),
            act=paddle.activation.BRelu())
    output_num_channels = 32
    # NOTE(review): the constant 160 presumably assumes a fixed input feature
    # height that is halved by every layer -- confirm against the featurizer.
    output_height = 160 // pow(2, num_stacks) + 1
    return conv, output_num_channels, output_height
|
|
||||||
|
|
||||||
|
|
||||||
def rnn_group(input, size, num_stacks):
    """RNN group with stacked bidirectional simple RNN layers.

    :param input: Input layer.
    :type input: LayerOutput
    :param size: Number of RNN cells in each layer.
    :type size: int
    :param num_stacks: Number of stacked rnn layers.
    :type num_stacks: int
    :return: Output layer of the RNN group.
    :rtype: LayerOutput
    """
    output = input
    # `range` exists on both Python 2 and 3, unlike `xrange`.
    for i in range(num_stacks):
        # Each layer consumes the output of the previous one.
        output = bidirectional_simple_rnn_bn_layer(
            name=str(i), input=output, size=size, act=paddle.activation.BRelu())
    return output
|
|
||||||
|
|
||||||
|
|
||||||
def deep_speech2(audio_data,
                 text_data,
                 dict_size,
                 num_conv_layers=2,
                 num_rnn_layers=3,
                 rnn_size=256):
    """
    The whole DeepSpeech2 model structure (a simplified version).

    :param audio_data: Audio spectrogram data layer.
    :type audio_data: LayerOutput
    :param text_data: Transcription text data layer.
    :type text_data: LayerOutput
    :param dict_size: Dictionary size for tokenized transcription.
    :type dict_size: int
    :param num_conv_layers: Number of stacking convolution layers.
    :type num_conv_layers: int
    :param num_rnn_layers: Number of stacking RNN layers.
    :type num_rnn_layers: int
    :param rnn_size: RNN layer size (number of RNN cells).
    :type rnn_size: int
    :return: A tuple of the softmax output layer (named ``log_probs`` in the
             code, although it is the softmax of the last fc layer) and a
             ctc cost layer computed on that fc layer.
    :rtype: tuple of LayerOutput
    """
    # convolution group
    conv_group_output, conv_group_num_channels, conv_group_height = conv_group(
        input=audio_data, num_stacks=num_conv_layers)
    # convert data from convolution feature map to sequence of vectors
    conv2seq = paddle.layer.block_expand(
        input=conv_group_output,
        num_channels=conv_group_num_channels,
        stride_x=1,
        stride_y=1,
        block_x=1,
        block_y=conv_group_height)
    # rnn group
    rnn_group_output = rnn_group(
        input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers)
    # linear output layer with one extra unit on top of dict_size; the ctc
    # layer below uses index dict_size as its blank label
    fc = paddle.layer.fc(
        input=rnn_group_output,
        size=dict_size + 1,
        act=paddle.activation.Linear(),
        bias_attr=True)
    # probability distribution with softmax
    log_probs = paddle.layer.mixed(
        input=paddle.layer.identity_projection(input=fc),
        act=paddle.activation.Softmax())
    # ctc cost, computed on the fc output rather than on the softmax layer
    ctc_loss = paddle.layer.warp_ctc(
        input=fc,
        label=text_data,
        size=dict_size + 1,
        blank=dict_size,
        norm_by_times=True)
    return log_probs, ctc_loss
|
|
@ -0,0 +1,274 @@
|
|||||||
|
"""Contains DeepSpeech2 layers and networks."""
|
||||||
|
from __future__ import absolute_import
|
||||||
|
from __future__ import division
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
|
import paddle.v2 as paddle
|
||||||
|
|
||||||
|
|
||||||
|
def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
                  padding, act):
    """Build a convolution layer followed by batch normalization.

    :param input: Input layer.
    :type input: LayerOutput
    :param filter_size: The x dimension of a filter kernel. Or input a tuple for
                        two image dimension.
    :type filter_size: int|tuple|list
    :param num_channels_in: Number of input channels.
    :type num_channels_in: int
    :param num_channels_out: Number of output channels.
    :type num_channels_out: int
    :param stride: Stride forwarded to the convolution layer.
    :param padding: The x dimension of the padding. Or input a tuple for two
                    image dimension.
    :type padding: int|tuple|list
    :param act: Activation type, applied by the batch norm layer.
    :type act: BaseActivation
    :return: Batch norm layer after convolution layer.
    :rtype: LayerOutput
    """
    # The convolution itself is linear and bias-free; the requested
    # activation is applied by the batch norm layer stacked on top of it.
    linear_conv = paddle.layer.img_conv(
        input=input,
        num_channels=num_channels_in,
        num_filters=num_channels_out,
        filter_size=filter_size,
        stride=stride,
        padding=padding,
        act=paddle.activation.Linear(),
        bias_attr=False)
    normalized = paddle.layer.batch_norm(input=linear_conv, act=act)
    return normalized
|
||||||
|
|
||||||
|
|
||||||
|
def bidirectional_simple_rnn_bn_layer(name, input, size, act, share_weights):
    """Build a bidirectional simple RNN on batch-normalized projections.

    Batch normalization is applied to the input projections only.

    :param name: Name of the layer (accepted but not used in this function).
    :type name: string
    :param input: Input layer.
    :type input: LayerOutput
    :param size: Number of RNN cells.
    :type size: int
    :param act: Activation type of the recurrent layers.
    :type act: BaseActivation
    :param share_weights: Whether to share input-hidden weights between
                          forward and backward directional RNNs.
    :type share_weights: bool
    :return: Concatenation of the forward and backward recurrent layers.
    :rtype: LayerOutput
    """
    if share_weights:
        # One linear, bias-free projection (and its batch norm) feeds both
        # directions.
        shared_proj = paddle.layer.fc(
            input=input,
            size=size,
            act=paddle.activation.Linear(),
            bias_attr=False)
        shared_proj_bn = paddle.layer.batch_norm(
            input=shared_proj, act=paddle.activation.Linear())
        forward_input = shared_proj_bn
        backward_input = shared_proj_bn
    else:
        # Separate projections per direction, each with its own batch norm.
        forward_proj = paddle.layer.fc(
            input=input,
            size=size,
            act=paddle.activation.Linear(),
            bias_attr=False)
        backward_proj = paddle.layer.fc(
            input=input,
            size=size,
            act=paddle.activation.Linear(),
            bias_attr=False)
        forward_input = paddle.layer.batch_norm(
            input=forward_proj, act=paddle.activation.Linear())
        backward_input = paddle.layer.batch_norm(
            input=backward_proj, act=paddle.activation.Linear())

    # Forward in time first, then backward, as in the concat order below.
    forward_rnn = paddle.layer.recurrent(
        input=forward_input, act=act, reverse=False)
    backward_rnn = paddle.layer.recurrent(
        input=backward_input, act=act, reverse=True)
    return paddle.layer.concat(input=[forward_rnn, backward_rnn])
|
||||||
|
|
||||||
|
|
||||||
|
def bidirectional_gru_bn_layer(name, input, size, act):
    """Build a bidirectional GRU layer on batch-normalized projections.

    Batch normalization is applied to the input projections only. Each
    direction has its own projection of width ``size * 3``.

    :param name: Name of the layer (accepted but not used in this function).
    :type name: string
    :param input: Input layer.
    :type input: LayerOutput
    :param size: Number of RNN cells.
    :type size: int
    :param act: Activation type of the GRU layers.
    :type act: BaseActivation
    :return: Concatenation of the forward and backward GRU layers.
    :rtype: LayerOutput
    """
    proj_width = size * 3
    # Linear, bias-free input projections, one per direction.
    forward_proj = paddle.layer.fc(
        input=input,
        size=proj_width,
        act=paddle.activation.Linear(),
        bias_attr=False)
    backward_proj = paddle.layer.fc(
        input=input,
        size=proj_width,
        act=paddle.activation.Linear(),
        bias_attr=False)
    # Batch norm on each projection.
    forward_proj_bn = paddle.layer.batch_norm(
        input=forward_proj, act=paddle.activation.Linear())
    backward_proj_bn = paddle.layer.batch_norm(
        input=backward_proj, act=paddle.activation.Linear())
    # Forward in time first, then backward, as in the concat order below.
    gru_forward = paddle.layer.grumemory(
        input=forward_proj_bn, act=act, reverse=False)
    gru_backward = paddle.layer.grumemory(
        input=backward_proj_bn, act=act, reverse=True)
    both_directions = [gru_forward, gru_backward]
    return paddle.layer.concat(input=both_directions)
|
||||||
|
|
||||||
|
|
||||||
|
def conv_group(input, num_stacks):
    """Convolution group with stacked convolution layers.

    The first layer uses an (11, 41) filter with stride (3, 2); each of the
    remaining ``num_stacks - 1`` layers uses an (11, 21) filter with stride
    (1, 2). All layers have 32 output channels and BRelu activation.

    :param input: Input layer.
    :type input: LayerOutput
    :param num_stacks: Number of stacked convolution layers.
    :type num_stacks: int
    :return: A tuple of the output layer of the convolution group, its number
             of channels and its height.
    :rtype: tuple
    """
    conv = conv_bn_layer(
        input=input,
        filter_size=(11, 41),
        num_channels_in=1,
        num_channels_out=32,
        stride=(3, 2),
        padding=(5, 20),
        act=paddle.activation.BRelu())
    # `range` exists on both Python 2 and 3, unlike `xrange`.
    for _ in range(num_stacks - 1):
        conv = conv_bn_layer(
            input=conv,
            filter_size=(11, 21),
            num_channels_in=32,
            num_channels_out=32,
            stride=(1, 2),
            padding=(5, 10),
            act=paddle.activation.BRelu())
    output_num_channels = 32
    # NOTE(review): the constant 160 presumably assumes a fixed input feature
    # height that is halved by every layer -- confirm against the featurizer.
    output_height = 160 // pow(2, num_stacks) + 1
    return conv, output_num_channels, output_height
|
||||||
|
|
||||||
|
|
||||||
|
def rnn_group(input, size, num_stacks, use_gru, share_rnn_weights):
    """RNN group with stacked bidirectional simple RNN or GRU layers.

    :param input: Input layer.
    :type input: LayerOutput
    :param size: Number of RNN cells in each layer.
    :type size: int
    :param num_stacks: Number of stacked rnn layers.
    :type num_stacks: int
    :param use_gru: Use gru if set True. Use simple rnn if set False.
    :type use_gru: bool
    :param share_rnn_weights: Whether to share input-hidden weights between
                              forward and backward directional RNNs.
                              It is only available when use_gru=False.
    :type share_rnn_weights: bool
    :return: Output layer of the RNN group.
    :rtype: LayerOutput
    """
    output = input
    # `range` exists on both Python 2 and 3, unlike `xrange`.
    for i in range(num_stacks):
        if use_gru:
            # BRelu does not support hppl, need to add later. Use Relu instead.
            output = bidirectional_gru_bn_layer(
                name=str(i),
                input=output,
                size=size,
                act=paddle.activation.Relu())
        else:
            output = bidirectional_simple_rnn_bn_layer(
                name=str(i),
                input=output,
                size=size,
                act=paddle.activation.BRelu(),
                share_weights=share_rnn_weights)
    return output
|
||||||
|
|
||||||
|
|
||||||
|
def deep_speech_v2_network(audio_data,
                           text_data,
                           dict_size,
                           num_conv_layers=2,
                           num_rnn_layers=3,
                           rnn_size=256,
                           use_gru=False,
                           share_rnn_weights=True):
    """The DeepSpeech2 network structure.

    :param audio_data: Audio spectrogram data layer.
    :type audio_data: LayerOutput
    :param text_data: Transcription text data layer.
    :type text_data: LayerOutput
    :param dict_size: Dictionary size for tokenized transcription.
    :type dict_size: int
    :param num_conv_layers: Number of stacking convolution layers.
    :type num_conv_layers: int
    :param num_rnn_layers: Number of stacking RNN layers.
    :type num_rnn_layers: int
    :param rnn_size: RNN layer size (number of RNN cells).
    :type rnn_size: int
    :param use_gru: Use gru if set True. Use simple rnn if set False.
    :type use_gru: bool
    :param share_rnn_weights: Whether to share input-hidden weights between
                              forward and backward direction RNNs.
                              It is only available when use_gru=False.
    :type share_rnn_weights: bool
    :return: A tuple of the softmax output layer (named ``log_probs`` in the
             code, although it is the softmax of the last fc layer) and a
             ctc cost layer computed on that fc layer.
    :rtype: tuple of LayerOutput
    """
    # convolution group
    conv_group_output, conv_group_num_channels, conv_group_height = conv_group(
        input=audio_data, num_stacks=num_conv_layers)
    # convert data from convolution feature map to sequence of vectors
    conv2seq = paddle.layer.block_expand(
        input=conv_group_output,
        num_channels=conv_group_num_channels,
        stride_x=1,
        stride_y=1,
        block_x=1,
        block_y=conv_group_height)
    # rnn group
    rnn_group_output = rnn_group(
        input=conv2seq,
        size=rnn_size,
        num_stacks=num_rnn_layers,
        use_gru=use_gru,
        share_rnn_weights=share_rnn_weights)
    # linear output layer with one extra unit on top of dict_size; the ctc
    # layer below uses index dict_size as its blank label
    fc = paddle.layer.fc(
        input=rnn_group_output,
        size=dict_size + 1,
        act=paddle.activation.Linear(),
        bias_attr=True)
    # probability distribution with softmax
    log_probs = paddle.layer.mixed(
        input=paddle.layer.identity_projection(input=fc),
        act=paddle.activation.Softmax())
    # ctc cost, computed on the fc output rather than on the softmax layer
    ctc_loss = paddle.layer.warp_ctc(
        input=fc,
        label=text_data,
        size=dict_size + 1,
        blank=dict_size,
        norm_by_times=True)
    return log_probs, ctc_loss
|
@ -0,0 +1,121 @@
|
|||||||
|
"""Evaluation for DeepSpeech2 model."""
|
||||||
|
from __future__ import absolute_import
|
||||||
|
from __future__ import division
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import functools
|
||||||
|
import paddle.v2 as paddle
|
||||||
|
from data_utils.data import DataGenerator
|
||||||
|
from models.model import DeepSpeech2Model
|
||||||
|
from utils.error_rate import wer, cer
|
||||||
|
from utils.utility import add_arguments, print_arguments
|
||||||
|
|
||||||
|
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# Each add_arg call registers one "--<name>" command-line option, given as
# (name, type, default, help[, extra argparse keyword arguments]).
# yapf: disable
add_arg('batch_size',       int,    128,    "Minibatch size.")
add_arg('trainer_count',    int,    8,      "# of Trainers (CPUs or GPUs).")
add_arg('beam_size',        int,    500,    "Beam search width.")
add_arg('num_proc_bsearch', int,    12,     "# of CPUs for beam search.")
add_arg('num_proc_data',    int,    12,     "# of CPUs for data preprocessing.")
add_arg('num_conv_layers',  int,    2,      "# of convolution layers.")
add_arg('num_rnn_layers',   int,    3,      "# of recurrent layers.")
add_arg('rnn_layer_size',   int,    2048,   "# of recurrent cells per layer.")
add_arg('alpha',            float,  0.36,   "Coef of LM for beam search.")
add_arg('beta',             float,  0.25,   "Coef of WC for beam search.")
add_arg('cutoff_prob',      float,  0.99,   "Cutoff probability for pruning.")
add_arg('use_gru',          bool,   False,  "Use GRUs instead of simple RNNs.")
add_arg('use_gpu',          bool,   True,   "Use GPU or not.")
add_arg('share_rnn_weights', bool,  True,   "Share input-hidden weights across "
                                            "bi-directional RNNs. Not for GRU.")
add_arg('test_manifest',    str,
        'data/librispeech/manifest.test-clean',
        "Filepath of manifest to evaluate.")
add_arg('mean_std_path',    str,
        'data/librispeech/mean_std.npz',
        "Filepath of normalizer's mean & std.")
add_arg('vocab_path',       str,
        'data/librispeech/eng_vocab.txt',
        "Filepath of vocabulary.")
# The help text used to describe resuming *training*; this script only loads
# the parameters in order to evaluate them.
add_arg('model_path',       str,
        './checkpoints/params.latest.tar.gz',
        "Filepath of the trained model parameters to evaluate.")
add_arg('lang_model_path',  str,
        'lm/data/common_crawl_00.prune01111.trie.klm',
        "Filepath for language model.")
add_arg('decoding_method',  str,
        'ctc_beam_search',
        "Decoding method. Options: ctc_beam_search, ctc_greedy",
        choices=['ctc_beam_search', 'ctc_greedy'])
add_arg('error_rate_type',  str,
        'wer',
        "Error rate type for evaluation.",
        choices=['wer', 'cer'])
add_arg('specgram_type',    str,
        'linear',
        "Audio feature type. Options: linear, mfcc.",
        choices=['linear', 'mfcc'])
# yapf: enable
args = parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate():
    """Evaluate on whole test data for DeepSpeech2.

    Reads the test manifest batch by batch, decodes every batch with the
    configured decoding method and prints the running and the final average
    error rate (WER or CER, selected by ``args.error_rate_type``).

    :raises ValueError: If the test manifest yields no instance to evaluate.
    """
    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        num_threads=args.num_proc_data)
    batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.test_manifest,
        batch_size=args.batch_size,
        min_batch_size=1,
        sortagrad=False,
        shuffle_method=None)

    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        pretrained_model_path=args.model_path,
        share_rnn_weights=args.share_rnn_weights)

    error_rate_func = cer if args.error_rate_type == 'cer' else wer
    error_sum, num_ins = 0.0, 0
    for infer_data in batch_reader():
        result_transcripts = ds2_model.infer_batch(
            infer_data=infer_data,
            decoding_method=args.decoding_method,
            beam_alpha=args.alpha,
            beam_beta=args.beta,
            beam_size=args.beam_size,
            cutoff_prob=args.cutoff_prob,
            vocab_list=data_generator.vocab_list,
            language_model_path=args.lang_model_path,
            num_processes=args.num_proc_bsearch)
        # Map the token ids of every reference transcript back to text.
        target_transcripts = [
            ''.join([data_generator.vocab_list[token] for token in transcript])
            for _, transcript in infer_data
        ]
        for target, result in zip(target_transcripts, result_transcripts):
            error_sum += error_rate_func(target, result)
            num_ins += 1
        # Skip the running average until at least one instance was scored,
        # otherwise the division below would fail.
        if num_ins > 0:
            print("Error rate [%s] (%d/?) = %f" %
                  (args.error_rate_type, num_ins, error_sum / num_ins))
    if num_ins == 0:
        # Fail with an actionable message instead of a ZeroDivisionError.
        raise ValueError("No instance was evaluated; check the manifest "
                         "'%s'." % args.test_manifest)
    print("Final error rate [%s] (%d/%d) = %f" %
          (args.error_rate_type, num_ins, num_ins, error_sum / num_ins))
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Print the parsed arguments, initialize PaddlePaddle and evaluate."""
    print_arguments(args)
    paddle.init(trainer_count=args.trainer_count, use_gpu=args.use_gpu)
    evaluate()


if __name__ == '__main__':
    main()
|
@ -1,23 +0,0 @@
|
|||||||
"""Test Setup."""
|
|
||||||
import unittest
|
|
||||||
import numpy as np
|
|
||||||
import os
|
|
||||||
|
|
||||||
|
|
||||||
class TestSetup(unittest.TestCase):
    """Checks that the audio I/O dependency is installed and working."""

    def test_soundfile(self):
        """Write a small float WAV file with soundfile and read it back."""
        import shutil
        import tempfile
        import soundfile as sf
        # floating point data is typically limited to the interval [-1.0, 1.0],
        # but smaller/larger values are supported as well
        data = np.array([[1.75, -1.75], [1.0, -1.0], [0.5, -0.5],
                         [0.25, -0.25]])
        # Work in a private temporary directory so the test neither clobbers
        # a 'test.wav' in the current directory nor leaves a file behind when
        # an assertion fails.
        tmp_dir = tempfile.mkdtemp()
        try:
            wav_path = os.path.join(tmp_dir, 'test.wav')
            sf.write(wav_path, data, 44100, format='WAV', subtype='FLOAT')
            read, fs = sf.read(wav_path)
            self.assertTrue(np.all(read == data))
            self.assertEqual(fs, 44100)
        finally:
            shutil.rmtree(tmp_dir)


if __name__ == '__main__':
    unittest.main()
|
|
@ -0,0 +1,131 @@
|
|||||||
|
"""Beam search parameters tuning for DeepSpeech2 model."""
|
||||||
|
from __future__ import absolute_import
|
||||||
|
from __future__ import division
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import argparse
|
||||||
|
import functools
|
||||||
|
import paddle.v2 as paddle
|
||||||
|
import _init_paths
|
||||||
|
from data_utils.data import DataGenerator
|
||||||
|
from models.model import DeepSpeech2Model
|
||||||
|
from utils.error_rate import wer
|
||||||
|
from utils.utility import add_arguments, print_arguments
|
||||||
|
|
||||||
|
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# Each add_arg call registers one "--<name>" command-line option, given as
# (name, type, default, help[, extra argparse keyword arguments]).
# yapf: disable
add_arg('num_samples',      int,    100,    "# of samples to infer.")
add_arg('trainer_count',    int,    8,      "# of Trainers (CPUs or GPUs).")
add_arg('beam_size',        int,    500,    "Beam search width.")
add_arg('num_proc_bsearch', int,    12,     "# of CPUs for beam search.")
add_arg('num_conv_layers',  int,    2,      "# of convolution layers.")
add_arg('num_rnn_layers',   int,    3,      "# of recurrent layers.")
add_arg('rnn_layer_size',   int,    2048,   "# of recurrent cells per layer.")
add_arg('num_alphas',       int,    14,     "# of alpha candidates for tuning.")
add_arg('num_betas',        int,    20,     "# of beta candidates for tuning.")
add_arg('alpha_from',       float,  0.1,    "Where alpha starts tuning from.")
add_arg('alpha_to',         float,  0.36,   "Where alpha ends tuning with.")
add_arg('beta_from',        float,  0.05,   "Where beta starts tuning from.")
add_arg('beta_to',          float,  1.0,    "Where beta ends tuning with.")
add_arg('cutoff_prob',      float,  0.99,   "Cutoff probability for pruning.")
add_arg('use_gru',          bool,   False,  "Use GRUs instead of simple RNNs.")
add_arg('use_gpu',          bool,   True,   "Use GPU or not.")
add_arg('share_rnn_weights', bool,  True,   "Share input-hidden weights across "
                                            "bi-directional RNNs. Not for GRU.")
add_arg('tune_manifest',    str,
        'data/librispeech/manifest.dev',
        "Filepath of manifest to tune.")
add_arg('mean_std_path',    str,
        'data/librispeech/mean_std.npz',
        "Filepath of normalizer's mean & std.")
add_arg('vocab_path',       str,
        'data/librispeech/eng_vocab.txt',
        "Filepath of vocabulary.")
add_arg('lang_model_path',  str,
        'lm/data/common_crawl_00.prune01111.trie.klm',
        "Filepath for language model.")
# The help text used to describe resuming *training*; this script only loads
# the parameters in order to tune the decoder on them.
add_arg('model_path',       str,
        './checkpoints/params.latest.tar.gz',
        "Filepath of the trained model parameters to tune with.")
# NOTE(review): this option is parsed, but tune() always scores with wer();
# confirm whether CER tuning is meant to be supported.
add_arg('error_rate_type',  str,
        'wer',
        "Error rate type for evaluation.",
        choices=['wer', 'cer'])
add_arg('specgram_type',    str,
        'linear',
        "Audio feature type. Options: linear, mfcc.",
        choices=['linear', 'mfcc'])
# yapf: enable
args = parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def tune():
    """Tune parameters alpha and beta on one minibatch.

    Takes the first minibatch (of ``args.num_samples`` instances) from the
    tuning manifest, decodes it with CTC beam search for every (alpha, beta)
    pair of a linearly spaced grid and prints the average WER of each pair.

    :raises ValueError: If num_alphas or num_betas is negative, or if the
                        tuning manifest yields no sample.
    """
    if args.num_alphas < 0:
        raise ValueError("num_alphas must be non-negative!")
    if args.num_betas < 0:
        raise ValueError("num_betas must be non-negative!")

    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        num_threads=1)
    batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.tune_manifest,
        batch_size=args.num_samples,
        sortagrad=False,
        shuffle_method=None)
    # The builtin next() works on Python 2 and 3 (the .next() method is
    # Python 2 only); the default avoids a bare StopIteration on an empty
    # reader.
    tune_data = next(batch_reader(), [])
    # Map the token ids of every reference transcript back to text.
    target_transcripts = [
        ''.join([data_generator.vocab_list[token] for token in transcript])
        for _, transcript in tune_data
    ]
    if not target_transcripts:
        raise ValueError("No sample to tune on; check the manifest '%s'." %
                         args.tune_manifest)

    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        pretrained_model_path=args.model_path,
        share_rnn_weights=args.share_rnn_weights)

    # create grid for search
    cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas)
    cand_betas = np.linspace(args.beta_from, args.beta_to, args.num_betas)
    params_grid = [(alpha, beta) for alpha in cand_alphas
                   for beta in cand_betas]

    # tune parameters in loop
    for alpha, beta in params_grid:
        result_transcripts = ds2_model.infer_batch(
            infer_data=tune_data,
            decoding_method='ctc_beam_search',
            beam_alpha=alpha,
            beam_beta=beta,
            beam_size=args.beam_size,
            cutoff_prob=args.cutoff_prob,
            vocab_list=data_generator.vocab_list,
            language_model_path=args.lang_model_path,
            num_processes=args.num_proc_bsearch)
        errors = [
            wer(target, result)
            for target, result in zip(target_transcripts, result_transcripts)
        ]
        print("alpha = %f\tbeta = %f\tWER = %f" %
              (alpha, beta, sum(errors) / len(errors)))
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Print the parsed arguments, initialize PaddlePaddle and tune."""
    print_arguments(args)
    paddle.init(trainer_count=args.trainer_count, use_gpu=args.use_gpu)
    tune()


if __name__ == '__main__':
    main()
|
@ -1,196 +0,0 @@
|
|||||||
"""Parameters tuning for DeepSpeech2 model."""
|
|
||||||
from __future__ import absolute_import
|
|
||||||
from __future__ import division
|
|
||||||
from __future__ import print_function
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import distutils.util
|
|
||||||
import argparse
|
|
||||||
import multiprocessing
|
|
||||||
import paddle.v2 as paddle
|
|
||||||
from data_utils.data import DataGenerator
|
|
||||||
from model import DeepSpeech2Model
|
|
||||||
from error_rate import wer
|
|
||||||
import utils
|
|
||||||
|
|
||||||
# Command-line options of the tuning script.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--num_samples",
    default=100,
    type=int,
    help="Number of samples for parameters tuning. (default: %(default)s)")
parser.add_argument(
    "--num_conv_layers",
    default=2,
    type=int,
    help="Convolution layer number. (default: %(default)s)")
parser.add_argument(
    "--num_rnn_layers",
    default=3,
    type=int,
    help="RNN layer number. (default: %(default)s)")
parser.add_argument(
    "--rnn_layer_size",
    default=512,
    type=int,
    help="RNN layer cell number. (default: %(default)s)")
parser.add_argument(
    "--use_gpu",
    default=True,
    type=distutils.util.strtobool,
    help="Use gpu or not. (default: %(default)s)")
parser.add_argument(
    "--trainer_count",
    default=8,
    type=int,
    help="Trainer number. (default: %(default)s)")
parser.add_argument(
    "--num_threads_data",
    default=1,
    type=int,
    help="Number of cpu threads for preprocessing data. (default: %(default)s)")
parser.add_argument(
    "--num_processes_beam_search",
    # Half of the cores, but never 0 (which a single-core host would give).
    default=max(1, multiprocessing.cpu_count() // 2),
    type=int,
    help="Number of cpu processes for beam search. (default: %(default)s)")
parser.add_argument(
    "--specgram_type",
    default='linear',
    type=str,
    help="Feature type of audio data: 'linear' (power spectrum)"
    " or 'mfcc'. (default: %(default)s)")
parser.add_argument(
    "--mean_std_filepath",
    default='mean_std.npz',
    type=str,
    help="Manifest path for normalizer. (default: %(default)s)")
parser.add_argument(
    "--tune_manifest_path",
    default='datasets/manifest.dev',
    type=str,
    help="Manifest path for tuning. (default: %(default)s)")
parser.add_argument(
    "--model_filepath",
    default='checkpoints/params.latest.tar.gz',
    type=str,
    help="Model filepath. (default: %(default)s)")
parser.add_argument(
    "--vocab_filepath",
    default='datasets/vocab/eng_vocab.txt',
    type=str,
    help="Vocabulary filepath. (default: %(default)s)")
parser.add_argument(
    "--beam_size",
    default=500,
    type=int,
    help="Width for beam search decoding. (default: %(default)d)")
parser.add_argument(
    "--language_model_path",
    default="lm/data/common_crawl_00.prune01111.trie.klm",
    type=str,
    help="Path for language model. (default: %(default)s)")
parser.add_argument(
    "--alpha_from",
    default=0.1,
    type=float,
    help="Where alpha starts from. (default: %(default)f)")
parser.add_argument(
    "--num_alphas",
    default=14,
    type=int,
    help="Number of candidate alphas. (default: %(default)d)")
parser.add_argument(
    "--alpha_to",
    default=0.36,
    type=float,
    help="Where alpha ends with. (default: %(default)f)")
parser.add_argument(
    "--beta_from",
    default=0.05,
    type=float,
    help="Where beta starts from. (default: %(default)f)")
parser.add_argument(
    "--num_betas",
    default=20,
    # A count of grid points, so it must be an int like --num_alphas
    # (it was declared as float, which does not fit np.linspace's `num`).
    type=int,
    help="Number of candidate betas. (default: %(default)d)")
parser.add_argument(
    "--beta_to",
    default=1.0,
    type=float,
    help="Where beta ends with. (default: %(default)f)")
parser.add_argument(
    "--cutoff_prob",
    default=0.99,
    type=float,
    help="The cutoff probability of pruning "
    "in beam search. (default: %(default)f)")
args = parser.parse_args()
|
|
||||||
|
|
||||||
|
|
||||||
def tune():
    """Tune parameters alpha and beta on one minibatch.

    Takes the first minibatch (of ``args.num_samples`` instances) from the
    tuning manifest, decodes it with beam search for every (alpha, beta)
    pair of a linearly spaced grid and prints the average WER of each pair.

    :raises ValueError: If num_alphas or num_betas is negative, or if the
                        tuning manifest yields no sample.
    """
    if args.num_alphas < 0:
        raise ValueError("num_alphas must be non-negative!")
    if args.num_betas < 0:
        raise ValueError("num_betas must be non-negative!")

    data_generator = DataGenerator(
        vocab_filepath=args.vocab_filepath,
        mean_std_filepath=args.mean_std_filepath,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        num_threads=args.num_threads_data)
    batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.tune_manifest_path,
        batch_size=args.num_samples,
        sortagrad=False,
        shuffle_method=None)
    # The builtin next() works on Python 2 and 3 (the .next() method is
    # Python 2 only); the default avoids a bare StopIteration on an empty
    # reader.
    tune_data = next(batch_reader(), [])
    # Map the token ids of every reference transcript back to text.
    target_transcripts = [
        ''.join([data_generator.vocab_list[token] for token in transcript])
        for _, transcript in tune_data
    ]
    if not target_transcripts:
        raise ValueError("No sample to tune on; check the manifest '%s'." %
                         args.tune_manifest_path)

    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        pretrained_model_path=args.model_filepath)

    # create grid for search; the counts are cast to int because
    # np.linspace expects an integer number of points and --num_betas may
    # arrive as a float from the command line.
    cand_alphas = np.linspace(args.alpha_from, args.alpha_to,
                              int(args.num_alphas))
    cand_betas = np.linspace(args.beta_from, args.beta_to,
                             int(args.num_betas))
    params_grid = [(alpha, beta) for alpha in cand_alphas
                   for beta in cand_betas]

    # tune parameters in loop
    for alpha, beta in params_grid:
        result_transcripts = ds2_model.infer_batch(
            infer_data=tune_data,
            decode_method='beam_search',
            beam_alpha=alpha,
            beam_beta=beta,
            beam_size=args.beam_size,
            cutoff_prob=args.cutoff_prob,
            vocab_list=data_generator.vocab_list,
            language_model_path=args.language_model_path,
            num_processes=args.num_processes_beam_search)
        errors = [
            wer(target, result)
            for target, result in zip(target_transcripts, result_transcripts)
        ]
        print("alpha = %f\tbeta = %f\tWER = %f" %
              (alpha, beta, sum(errors) / len(errors)))
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Print the parsed arguments, initialize PaddlePaddle and tune."""
    utils.print_arguments(args)
    paddle.init(trainer_count=args.trainer_count, use_gpu=args.use_gpu)
    tune()


if __name__ == '__main__':
    main()
|
|
@ -1,25 +0,0 @@
|
|||||||
"""Contains common utility functions."""
|
|
||||||
from __future__ import absolute_import
|
|
||||||
from __future__ import division
|
|
||||||
from __future__ import print_function
|
|
||||||
|
|
||||||
|
|
||||||
def print_arguments(args):
    """Print argparse's arguments.

    Usage:

    .. code-block:: python

        parser = argparse.ArgumentParser()
        parser.add_argument("name", default="John", type=str, help="User name.")
        args = parser.parse_args()
        print_arguments(args)

    :param args: Input argparse.Namespace for printing.
    :type args: argparse.Namespace
    """
    print("----- Configuration Arguments -----")
    # dict.items() exists on both Python 2 and 3; iteritems() is Python 2
    # only.
    for arg, value in vars(args).items():
        print("%s: %s" % (arg, value))
    print("------------------------------------")
|
|
@ -0,0 +1,47 @@
|
|||||||
|
"""Contains common utility functions."""
|
||||||
|
from __future__ import absolute_import
|
||||||
|
from __future__ import division
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
|
import distutils.util
|
||||||
|
|
||||||
|
|
||||||
|
def print_arguments(args):
    """Print argparse's arguments, sorted by argument name.

    Usage:

    .. code-block:: python

        parser = argparse.ArgumentParser()
        parser.add_argument("name", default="John", type=str, help="User name.")
        args = parser.parse_args()
        print_arguments(args)

    :param args: Input argparse.Namespace for printing.
    :type args: argparse.Namespace
    """
    print("-----------  Configuration Arguments -----------")
    # dict.items() exists on both Python 2 and 3; iteritems() is Python 2
    # only.
    for arg, value in sorted(vars(args).items()):
        print("%s: %s" % (arg, value))
    print("------------------------------------------------")
|
||||||
|
|
||||||
|
|
||||||
|
def _str2bool(value):
    """Convert a command-line truth string ("yes", "false", "1", ...) to bool.

    Accepts the same spellings as distutils' strtobool, case-insensitively.

    :raises ValueError: If the string is not a recognized truth value
                        (argparse reports this as a usage error).
    """
    lowered = value.lower()
    if lowered in ('y', 'yes', 't', 'true', 'on', '1'):
        return True
    if lowered in ('n', 'no', 'f', 'false', 'off', '0'):
        return False
    raise ValueError("invalid truth value %r" % (value, ))


def add_arguments(argname, type, default, help, argparser, **kwargs):
    """Add argparse's argument.

    Usage:

    .. code-block:: python

        parser = argparse.ArgumentParser()
        add_arguments("name", str, "John", "User name.", parser)
        args = parser.parse_args()

    :param argname: Option name; registered as "--<argname>".
    :type argname: str
    :param type: Type (or converter) of the option value. ``bool`` is
                 replaced by a parser of truth strings, because ``bool("no")``
                 would be True.
    :type type: callable
    :param default: Default value of the option.
    :param help: Help text; the default value is appended to it.
    :type help: str
    :param argparser: Parser to add the option to.
    :type argparser: argparse.ArgumentParser
    :param kwargs: Extra keyword arguments forwarded to ``add_argument``
                   (e.g. ``choices``).
    """
    # Use a local converter rather than the deprecated distutils helper; it
    # also yields real booleans instead of 0/1.
    converter = _str2bool if type is bool else type
    argparser.add_argument(
        "--" + argname,
        default=default,
        type=converter,
        help=help + ' Default: %(default)s.',
        **kwargs)
|
Loading…
Reference in new issue