Merge branch 'develop' of https://github.com/PaddlePaddle/models into ctc_decoder_deploy

Yibing Liu 7 years ago
commit 11ede80a48

@@ -1,5 +1,7 @@
 # DeepSpeech2 on PaddlePaddle
 
+>TODO: to be updated, since the directory hierarchy was changed.
+
 ## Installation
 
 ```

@ -1,7 +1,9 @@
TRAIN_MANIFEST="cloud/cloud.manifest.train" #! /usr/bin/bash
DEV_MANIFEST="cloud/cloud.manifest.dev"
CLOUD_MODEL_DIR="/pfs/dlnel/home/USERNAME/deepspeech2/model" TRAIN_MANIFEST="cloud/cloud_manifests/cloud.manifest.train"
BATCH_SIZE=256 DEV_MANIFEST="cloud/cloud_manifests/cloud.manifest.dev"
CLOUD_MODEL_DIR="./checkpoints"
BATCH_SIZE=512
NUM_GPU=8 NUM_GPU=8
NUM_NODE=1 NUM_NODE=1
IS_LOCAL="True" IS_LOCAL="True"
@ -11,7 +13,7 @@ DS2_PATH=${PWD%/*}
cp -f pcloud_train.sh ${DS2_PATH} cp -f pcloud_train.sh ${DS2_PATH}
paddlecloud submit \ paddlecloud submit \
-image bootstrapper:5000/wanghaoshuang/pcloud_ds2:latest \ -image bootstrapper:5000/paddlepaddle/pcloud_ds2:latest \
-jobname ${JOB_NAME} \ -jobname ${JOB_NAME} \
-cpu ${NUM_GPU} \ -cpu ${NUM_GPU} \
-gpu ${NUM_GPU} \ -gpu ${NUM_GPU} \

@ -1,3 +1,5 @@
#! /usr/bin/bash
TRAIN_MANIFEST=$1 TRAIN_MANIFEST=$1
DEV_MANIFEST=$2 DEV_MANIFEST=$2
MODEL_PATH=$3 MODEL_PATH=$3
@@ -13,12 +15,30 @@ python ./cloud/split_data.py \
 --in_manifest_path=${DEV_MANIFEST} \
 --out_manifest_path='/local.manifest.dev'
 
-python train.py \
---batch_size=$BATCH_SIZE \
---use_gpu=1 \
+python -u train.py \
+--batch_size=${BATCH_SIZE} \
 --trainer_count=${NUM_GPU} \
---num_threads_data=${NUM_GPU} \
+--num_passes=200 \
+--num_proc_data=${NUM_GPU} \
+--num_conv_layers=2 \
+--num_rnn_layers=3 \
+--rnn_layer_size=2048 \
+--num_iter_print=100 \
+--learning_rate=5e-4 \
+--max_duration=27.0 \
+--min_duration=0.0 \
+--use_sortagrad=True \
+--use_gru=False \
+--use_gpu=True \
 --is_local=${IS_LOCAL} \
---train_manifest_path='/local.manifest.train' \
---dev_manifest_path='/local.manifest.dev' \
+--share_rnn_weights=True \
+--train_manifest='/local.manifest.train' \
+--dev_manifest='/local.manifest.dev' \
+--mean_std_path='data/librispeech/mean_std.npz' \
+--vocab_path='data/librispeech/eng_vocab.txt' \
+--output_model_dir='./checkpoints' \
 --output_model_dir=${MODEL_PATH} \
+--augment_conf_path='conf/augmentation.config' \
+--specgram_type='linear' \
+--shuffle_method='batch_shuffle_clipped' \
+2>&1 | tee ./log/train.log

@@ -1,5 +1,9 @@
-IN_MANIFESTS="../datasets/manifest.train ../datasets/manifest.dev ../datasets/manifest.test"
-OUT_MANIFESTS="./cloud.manifest.train ./cloud.manifest.dev ./cloud.manifest.test"
+#! /usr/bin/bash
+
+mkdir cloud_manifests
+
+IN_MANIFESTS="../data/librispeech/manifest.train ../data/librispeech/manifest.dev-clean ../data/librispeech/manifest.test-clean"
+OUT_MANIFESTS="cloud_manifests/cloud.manifest.train cloud_manifests/cloud.manifest.dev cloud_manifests/cloud.manifest.test"
 CLOUD_DATA_DIR="/pfs/dlnel/home/USERNAME/deepspeech2/data/librispeech"
 NUM_SHARDS=50
 
@@ -14,4 +18,5 @@ then
     echo "Upload Data Failed!"
     exit 1
 fi
+
 echo "All Done."

@@ -4,23 +4,22 @@ from __future__ import division
 from __future__ import print_function
 
 from data_utils.augmentor.base import AugmentorBase
-from data_utils import utils
+from data_utils.utility import read_manifest
 from data_utils.audio import AudioSegment
 
 
 class ImpulseResponseAugmentor(AugmentorBase):
     """Augmentation model for adding impulse response effect.
 
     :param rng: Random generator object.
     :type rng: random.Random
     :param impulse_manifest_path: Manifest path for impulse audio data.
     :type impulse_manifest_path: basestring
     """
 
     def __init__(self, rng, impulse_manifest_path):
         self._rng = rng
-        self._impulse_manifest = utils.read_manifest(
-            manifest_path=impulse_manifest_path)
+        self._impulse_manifest = read_manifest(impulse_manifest_path)
 
     def transform_audio(self, audio_segment):
         """Add impulse response effect.

@@ -4,13 +4,13 @@ from __future__ import division
 from __future__ import print_function
 
 from data_utils.augmentor.base import AugmentorBase
-from data_utils import utils
+from data_utils.utility import read_manifest
 from data_utils.audio import AudioSegment
 
 
 class NoisePerturbAugmentor(AugmentorBase):
     """Augmentation model for adding background noise.
 
     :param rng: Random generator object.
     :type rng: random.Random
     :param min_snr_dB: Minimal signal noise ratio, in decibels.
@@ -18,15 +18,14 @@ class NoisePerturbAugmentor(AugmentorBase):
     :param max_snr_dB: Maximal signal noise ratio, in decibels.
     :type max_snr_dB: float
     :param noise_manifest_path: Manifest path for noise audio data.
     :type noise_manifest_path: basestring
     """
 
     def __init__(self, rng, min_snr_dB, max_snr_dB, noise_manifest_path):
         self._min_snr_dB = min_snr_dB
         self._max_snr_dB = max_snr_dB
         self._rng = rng
-        self._noise_manifest = utils.read_manifest(
-            manifest_path=noise_manifest_path)
+        self._noise_manifest = read_manifest(manifest_path=noise_manifest_path)
 
     def transform_audio(self, audio_segment):
         """Add background noise audio.

@@ -11,7 +11,7 @@ import multiprocessing
 import numpy as np
 import paddle.v2 as paddle
 from threading import local
-from data_utils import utils
+from data_utils.utility import read_manifest
 from data_utils.augmentor.augmentation import AugmentationPipeline
 from data_utils.featurizer.speech_featurizer import SpeechFeaturizer
 from data_utils.speech import SpeechSegment
@@ -85,9 +85,9 @@ class DataGenerator(object):
         self._rng = random.Random(random_seed)
         self._epoch = 0
         # for caching tar files info
-        self.local_data = local()
-        self.local_data.tar2info = {}
-        self.local_data.tar2object = {}
+        self._local_data = local()
+        self._local_data.tar2info = {}
+        self._local_data.tar2object = {}
 
     def process_utterance(self, filename, transcript):
         """Load, augment, featurize and normalize for speech data.
@@ -159,7 +159,7 @@ class DataGenerator(object):
         def batch_reader():
             # read manifest
-            manifest = utils.read_manifest(
+            manifest = read_manifest(
                 manifest_path=manifest_path,
                 max_duration=self._max_duration,
                 min_duration=self._min_duration)
@@ -240,16 +240,16 @@ class DataGenerator(object):
         """
         if file.startswith('tar:'):
             tarpath, filename = file.split(':', 1)[1].split('#', 1)
-            if 'tar2info' not in self.local_data.__dict__:
-                self.local_data.tar2info = {}
-            if 'tar2object' not in self.local_data.__dict__:
-                self.local_data.tar2object = {}
-            if tarpath not in self.local_data.tar2info:
+            if 'tar2info' not in self._local_data.__dict__:
+                self._local_data.tar2info = {}
+            if 'tar2object' not in self._local_data.__dict__:
+                self._local_data.tar2object = {}
+            if tarpath not in self._local_data.tar2info:
                 object, infoes = self._parse_tar(tarpath)
-                self.local_data.tar2info[tarpath] = infoes
-                self.local_data.tar2object[tarpath] = object
-            return self.local_data.tar2object[tarpath].extractfile(
-                self.local_data.tar2info[tarpath][filename])
+                self._local_data.tar2info[tarpath] = infoes
+                self._local_data.tar2object[tarpath] = object
+            return self._local_data.tar2object[tarpath].extractfile(
+                self._local_data.tar2info[tarpath][filename])
         else:
             return open(file, 'r')
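
The `local_data` → `_local_data` rename just marks the cache private; the `threading.local()` is the load-bearing part. Each reader thread gets its own `tarfile` handle, because a handle's internal read position cannot be shared safely across the multi-threaded data readers. A condensed sketch of the pattern:

```python
import tarfile
import threading

_local = threading.local()  # independent attribute namespace per thread

def get_tar(tarpath):
    """Return this thread's cached handle for `tarpath`, opening it once."""
    if not hasattr(_local, 'tar2object'):
        _local.tar2object = {}  # first use in this thread
    if tarpath not in _local.tar2object:
        _local.tar2object[tarpath] = tarfile.open(tarpath)
    return _local.tar2object[tarpath]
```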

@@ -4,7 +4,7 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
-from data_utils import utils
+from data_utils.utility import read_manifest
 from data_utils.audio import AudioSegment
 from python_speech_features import mfcc
 from python_speech_features import delta

@@ -5,7 +5,7 @@ from __future__ import print_function
 
 import numpy as np
 import random
-import data_utils.utils as utils
+from data_utils.utility import read_manifest
 from data_utils.audio import AudioSegment
@@ -75,7 +75,7 @@ class FeatureNormalizer(object):
     def _compute_mean_std(self, manifest_path, featurize_func, num_samples):
         """Compute mean and std from randomly sampled instances."""
-        manifest = utils.read_manifest(manifest_path)
+        manifest = read_manifest(manifest_path)
         sampled_manifest = self._rng.sample(manifest, num_samples)
         features = []
         for instance in sampled_manifest:

@@ -1,13 +0,0 @@
-cd librispeech
-python librispeech.py
-if [ $? -ne 0 ]; then
-    echo "Prepare LibriSpeech failed. Terminated."
-    exit 1
-fi
-cd -
-
-cat librispeech/manifest.train* | shuf > manifest.train
-cat librispeech/manifest.dev-clean > manifest.dev
-cat librispeech/manifest.test-clean > manifest.test
-
-echo "All done."

@@ -1,10 +0,0 @@
-cd noise
-python chime3_background.py
-if [ $? -ne 0 ]; then
-    echo "Prepare CHiME3 background noise failed. Terminated."
-    exit 1
-fi
-cd -
-
-cat noise/manifest.* > manifest.noise
-echo "All done."

@@ -0,0 +1,19 @@
+"""Set up paths for DS2"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os.path
+import sys
+
+
+def add_path(path):
+    if path not in sys.path:
+        sys.path.insert(0, path)
+
+
+this_dir = os.path.dirname(__file__)
+
+# Add project path to PYTHONPATH
+proj_path = os.path.join(this_dir, '..')
+add_path(proj_path)
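
`_init_paths.py` lets scripts that live one level below the project root import the top-level packages without installing anything; the `import _init_paths` added to `deploy/demo_server.py` below relies purely on this side effect:

```python
# Hypothetical call site, one directory below the project root:
import _init_paths  # noqa: F401 -- side effect: prepends '..' to sys.path
from data_utils.data import DataGenerator  # now importable
```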

@@ -3,111 +3,64 @@ import os
 import time
 import random
 import argparse
-import distutils.util
+import functools
 from time import gmtime, strftime
 import SocketServer
 import struct
 import wave
 import paddle.v2 as paddle
-from utils import print_arguments
+import _init_paths
 from data_utils.data import DataGenerator
-from model import DeepSpeech2Model
+from models.model import DeepSpeech2Model
 from data_utils.utils import read_manifest
+from utils.utility import add_arguments, print_arguments
 
 parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
-    "--host_ip",
-    default="localhost",
-    type=str,
-    help="Server IP address. (default: %(default)s)")
-parser.add_argument(
-    "--host_port",
-    default=8086,
-    type=int,
-    help="Server Port. (default: %(default)s)")
-parser.add_argument(
-    "--speech_save_dir",
-    default="demo_cache",
-    type=str,
-    help="Directory for saving demo speech. (default: %(default)s)")
-parser.add_argument(
-    "--vocab_filepath",
-    default='datasets/vocab/eng_vocab.txt',
-    type=str,
-    help="Vocabulary filepath. (default: %(default)s)")
-parser.add_argument(
-    "--mean_std_filepath",
-    default='mean_std.npz',
-    type=str,
-    help="Manifest path for normalizer. (default: %(default)s)")
-parser.add_argument(
-    "--warmup_manifest_path",
-    default='datasets/manifest.test',
-    type=str,
-    help="Manifest path for warmup test. (default: %(default)s)")
-parser.add_argument(
-    "--specgram_type",
-    default='linear',
-    type=str,
-    help="Feature type of audio data: 'linear' (power spectrum)"
-    " or 'mfcc'. (default: %(default)s)")
-parser.add_argument(
-    "--num_conv_layers",
-    default=2,
-    type=int,
-    help="Convolution layer number. (default: %(default)s)")
-parser.add_argument(
-    "--num_rnn_layers",
-    default=3,
-    type=int,
-    help="RNN layer number. (default: %(default)s)")
-parser.add_argument(
-    "--rnn_layer_size",
-    default=512,
-    type=int,
-    help="RNN layer cell number. (default: %(default)s)")
-parser.add_argument(
-    "--use_gpu",
-    default=True,
-    type=distutils.util.strtobool,
-    help="Use gpu or not. (default: %(default)s)")
-parser.add_argument(
-    "--model_filepath",
-    default='checkpoints/params.latest.tar.gz',
-    type=str,
-    help="Model filepath. (default: %(default)s)")
-parser.add_argument(
-    "--decode_method",
-    default='beam_search',
-    type=str,
-    help="Method for ctc decoding: best_path or beam_search. "
-    "(default: %(default)s)")
-parser.add_argument(
-    "--beam_size",
-    default=100,
-    type=int,
-    help="Width for beam search decoding. (default: %(default)d)")
-parser.add_argument(
-    "--language_model_path",
-    default="lm/data/common_crawl_00.prune01111.trie.klm",
-    type=str,
-    help="Path for language model. (default: %(default)s)")
-parser.add_argument(
-    "--alpha",
-    default=0.36,
-    type=float,
-    help="Parameter associated with language model. (default: %(default)f)")
-parser.add_argument(
-    "--beta",
-    default=0.25,
-    type=float,
-    help="Parameter associated with word count. (default: %(default)f)")
-parser.add_argument(
-    "--cutoff_prob",
-    default=0.99,
-    type=float,
-    help="The cutoff probability of pruning"
-    "in beam search. (default: %(default)f)")
+add_arg = functools.partial(add_arguments, argparser=parser)
+# yapf: disable
+add_arg('host_port',        int,    8086,    "Server's IP port.")
+add_arg('beam_size',        int,    500,     "Beam search width.")
+add_arg('num_conv_layers',  int,    2,       "# of convolution layers.")
+add_arg('num_rnn_layers',   int,    3,       "# of recurrent layers.")
+add_arg('rnn_layer_size',   int,    2048,    "# of recurrent cells per layer.")
+add_arg('alpha',            float,  0.36,    "Coef of LM for beam search.")
+add_arg('beta',             float,  0.25,    "Coef of WC for beam search.")
+add_arg('cutoff_prob',      float,  0.99,    "Cutoff probability for pruning.")
+add_arg('use_gru',          bool,   False,   "Use GRUs instead of simple RNNs.")
+add_arg('use_gpu',          bool,   True,    "Use GPU or not.")
+add_arg('share_rnn_weights',bool,   True,    "Share input-hidden weights across "
+                                             "bi-directional RNNs. Not for GRU.")
+add_arg('host_ip',          str,
+        'localhost',
+        "Server's IP address.")
+add_arg('speech_save_dir',  str,
+        'demo_cache',
+        "Directory to save demo audios.")
+add_arg('warmup_manifest',  str,
+        'data/librispeech/manifest.test-clean',
+        "Filepath of manifest to warm up.")
+add_arg('mean_std_path',    str,
+        'data/librispeech/mean_std.npz',
+        "Filepath of normalizer's mean & std.")
+add_arg('vocab_path',       str,
+        'data/librispeech/eng_vocab.txt',
+        "Filepath of vocabulary.")
+add_arg('model_path',       str,
+        './checkpoints/params.latest.tar.gz',
+        "If None, the training starts from scratch, "
+        "otherwise, it resumes from the pre-trained model.")
+add_arg('lang_model_path',  str,
+        'lm/data/common_crawl_00.prune01111.trie.klm',
+        "Filepath for language model.")
+add_arg('decoding_method',  str,
+        'ctc_beam_search',
+        "Decoding method. Options: ctc_beam_search, ctc_greedy",
+        choices = ['ctc_beam_search', 'ctc_greedy'])
+add_arg('specgram_type',    str,
+        'linear',
+        "Audio feature type. Options: linear, mfcc.",
+        choices=['linear', 'mfcc'])
+# yapf: disable
 args = parser.parse_args()
@@ -188,8 +141,8 @@ def start_server():
     """Start the ASR server"""
     # prepare data generator
     data_generator = DataGenerator(
-        vocab_filepath=args.vocab_filepath,
-        mean_std_filepath=args.mean_std_filepath,
+        vocab_filepath=args.vocab_path,
+        mean_std_filepath=args.mean_std_path,
         augmentation_config='{}',
         specgram_type=args.specgram_type,
         num_threads=1)
@@ -199,20 +152,22 @@ def start_server():
         num_conv_layers=args.num_conv_layers,
         num_rnn_layers=args.num_rnn_layers,
         rnn_layer_size=args.rnn_layer_size,
-        pretrained_model_path=args.model_filepath)
+        use_gru=args.use_gru,
+        pretrained_model_path=args.model_path,
+        share_rnn_weights=args.share_rnn_weights)
 
     # prepare ASR inference handler
     def file_to_transcript(filename):
         feature = data_generator.process_utterance(filename, "")
         result_transcript = ds2_model.infer_batch(
             infer_data=[feature],
-            decode_method=args.decode_method,
+            decoding_method=args.decoding_method,
             beam_alpha=args.alpha,
             beam_beta=args.beta,
             beam_size=args.beam_size,
             cutoff_prob=args.cutoff_prob,
             vocab_list=data_generator.vocab_list,
-            language_model_path=args.language_model_path,
+            language_model_path=args.lang_model_path,
             num_processes=1)
         return result_transcript[0]
@@ -221,7 +176,7 @@ def start_server():
     print('Warming up ...')
     warm_up_test(
         audio_process_handler=file_to_transcript,
-        manifest_path=args.warmup_manifest_path,
+        manifest_path=args.warmup_manifest,
         num_test_cases=3)
     print('-----------------------------------------------------------')
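
The refactor above collapses ~20 verbose `parser.add_argument(...)` blocks into compact `add_arg` calls via the new `add_arguments` helper in `utils.utility`. That helper is not shown in this diff; a plausible reconstruction from its call sites (note `choices` passed through `**kwargs`, and `bool` mapped to `strtobool` so `--use_gpu=False` parses as expected; the actual helper may differ in detail):

```python
import distutils.util

def add_arguments(argname, type, default, help, argparser, **kwargs):
    """Register --<argname> on `argparser` with a uniform help format."""
    type = distutils.util.strtobool if type == bool else type
    argparser.add_argument(
        "--" + argname,
        default=default,
        type=type,
        help=help + ' Default: %(default)s.',
        **kwargs)
```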

@@ -1,180 +0,0 @@
-"""Evaluation for DeepSpeech2 model."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import distutils.util
-import argparse
-import multiprocessing
-import paddle.v2 as paddle
-from data_utils.data import DataGenerator
-from model import DeepSpeech2Model
-from error_rate import wer, cer
-import utils
-
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
-    "--batch_size",
-    default=128,
-    type=int,
-    help="Minibatch size for evaluation. (default: %(default)s)")
-parser.add_argument(
-    "--trainer_count",
-    default=8,
-    type=int,
-    help="Trainer number. (default: %(default)s)")
-parser.add_argument(
-    "--num_conv_layers",
-    default=2,
-    type=int,
-    help="Convolution layer number. (default: %(default)s)")
-parser.add_argument(
-    "--num_rnn_layers",
-    default=3,
-    type=int,
-    help="RNN layer number. (default: %(default)s)")
-parser.add_argument(
-    "--rnn_layer_size",
-    default=512,
-    type=int,
-    help="RNN layer cell number. (default: %(default)s)")
-parser.add_argument(
-    "--use_gpu",
-    default=True,
-    type=distutils.util.strtobool,
-    help="Use gpu or not. (default: %(default)s)")
-parser.add_argument(
-    "--num_threads_data",
-    default=multiprocessing.cpu_count() // 2,
-    type=int,
-    help="Number of cpu threads for preprocessing data. (default: %(default)s)")
-parser.add_argument(
-    "--num_processes_beam_search",
-    default=multiprocessing.cpu_count() // 2,
-    type=int,
-    help="Number of cpu processes for beam search. (default: %(default)s)")
-parser.add_argument(
-    "--mean_std_filepath",
-    default='mean_std.npz',
-    type=str,
-    help="Manifest path for normalizer. (default: %(default)s)")
-parser.add_argument(
-    "--decode_method",
-    default='beam_search',
-    type=str,
-    help="Method for ctc decoding, best_path or beam_search. "
-    "(default: %(default)s)")
-parser.add_argument(
-    "--language_model_path",
-    default="lm/data/common_crawl_00.prune01111.trie.klm",
-    type=str,
-    help="Path for language model. (default: %(default)s)")
-parser.add_argument(
-    "--alpha",
-    default=0.36,
-    type=float,
-    help="Parameter associated with language model. (default: %(default)f)")
-parser.add_argument(
-    "--beta",
-    default=0.25,
-    type=float,
-    help="Parameter associated with word count. (default: %(default)f)")
-parser.add_argument(
-    "--cutoff_prob",
-    default=0.99,
-    type=float,
-    help="The cutoff probability of pruning"
-    "in beam search. (default: %(default)f)")
-parser.add_argument(
-    "--beam_size",
-    default=500,
-    type=int,
-    help="Width for beam search decoding. (default: %(default)d)")
-parser.add_argument(
-    "--specgram_type",
-    default='linear',
-    type=str,
-    help="Feature type of audio data: 'linear' (power spectrum)"
-    " or 'mfcc'. (default: %(default)s)")
-parser.add_argument(
-    "--decode_manifest_path",
-    default='datasets/manifest.test',
-    type=str,
-    help="Manifest path for decoding. (default: %(default)s)")
-parser.add_argument(
-    "--model_filepath",
-    default='checkpoints/params.latest.tar.gz',
-    type=str,
-    help="Model filepath. (default: %(default)s)")
-parser.add_argument(
-    "--vocab_filepath",
-    default='datasets/vocab/eng_vocab.txt',
-    type=str,
-    help="Vocabulary filepath. (default: %(default)s)")
-parser.add_argument(
-    "--error_rate_type",
-    default='wer',
-    choices=['wer', 'cer'],
-    type=str,
-    help="Error rate type for evaluation. 'wer' for word error rate and 'cer' "
-    "for character error rate. "
-    "(default: %(default)s)")
-args = parser.parse_args()
-
-
-def evaluate():
-    """Evaluate on whole test data for DeepSpeech2."""
-    data_generator = DataGenerator(
-        vocab_filepath=args.vocab_filepath,
-        mean_std_filepath=args.mean_std_filepath,
-        augmentation_config='{}',
-        specgram_type=args.specgram_type,
-        num_threads=args.num_threads_data)
-    batch_reader = data_generator.batch_reader_creator(
-        manifest_path=args.decode_manifest_path,
-        batch_size=args.batch_size,
-        min_batch_size=1,
-        sortagrad=False,
-        shuffle_method=None)
-
-    ds2_model = DeepSpeech2Model(
-        vocab_size=data_generator.vocab_size,
-        num_conv_layers=args.num_conv_layers,
-        num_rnn_layers=args.num_rnn_layers,
-        rnn_layer_size=args.rnn_layer_size,
-        pretrained_model_path=args.model_filepath)
-
-    error_rate_func = cer if args.error_rate_type == 'cer' else wer
-    error_sum, num_ins = 0.0, 0
-    for infer_data in batch_reader():
-        result_transcripts = ds2_model.infer_batch(
-            infer_data=infer_data,
-            decode_method=args.decode_method,
-            beam_alpha=args.alpha,
-            beam_beta=args.beta,
-            beam_size=args.beam_size,
-            cutoff_prob=args.cutoff_prob,
-            vocab_list=data_generator.vocab_list,
-            language_model_path=args.language_model_path,
-            num_processes=args.num_processes_beam_search)
-        target_transcripts = [
-            ''.join([data_generator.vocab_list[token] for token in transcript])
-            for _, transcript in infer_data
-        ]
-        for target, result in zip(target_transcripts, result_transcripts):
-            error_sum += error_rate_func(target, result)
-            num_ins += 1
-        print("Error rate [%s] (%d/?) = %f" %
-              (args.error_rate_type, num_ins, error_sum / num_ins))
-    print("Final error rate [%s] (%d/%d) = %f" %
-          (args.error_rate_type, num_ins, num_ins, error_sum / num_ins))
-
-
-def main():
-    utils.print_arguments(args)
-    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
-    evaluate()
-
-
-if __name__ == '__main__':
-    main()
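
The deleted `evaluate.py` (re-added under the new layout at the end of this commit) averages `wer`/`cer` over the test set via `error_rate_func`. For reference, a word error rate boils down to word-level Levenshtein distance divided by the reference length; an illustrative version (the repo's `utils/error_rate.py` may differ in detail):

```python
def wer(reference, hypothesis):
    """Word-level edit distance / reference length; assumes a non-empty reference."""
    ref, hyp = reference.split(), hypothesis.split()
    d = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        d[i][0] = i  # deletions
    for j in range(len(hyp) + 1):
        d[0][j] = j  # insertions
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            sub = 0 if ref[i - 1] == hyp[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + sub)
    return float(d[len(ref)][len(hyp)]) / len(ref)
```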

@@ -0,0 +1,28 @@
+#! /usr/bin/bash
+
+pushd ../..
+
+CUDA_VISIBLE_DEVICES=0 \
+python -u infer.py \
+--num_samples=10 \
+--trainer_count=1 \
+--beam_size=500 \
+--num_proc_bsearch=12 \
+--num_proc_data=12 \
+--num_conv_layers=2 \
+--num_rnn_layers=3 \
+--rnn_layer_size=2048 \
+--alpha=0.36 \
+--beta=0.25 \
+--cutoff_prob=0.99 \
+--use_gru=False \
+--use_gpu=True \
+--share_rnn_weights=True \
+--infer_manifest='data/librispeech/manifest.dev-clean' \
+--mean_std_path='data/librispeech/mean_std.npz' \
+--vocab_path='data/librispeech/eng_vocab.txt' \
+--model_path='checkpoints/params.latest.tar.gz' \
+--lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \
+--decoding_method='ctc_beam_search' \
+--error_rate_type='wer' \
+--specgram_type='linear'

@@ -0,0 +1,32 @@
+#! /usr/bin/bash
+
+pushd ../..
+
+# download data, generate manifests
+python data/librispeech/librispeech.py \
+--manifest_prefix='data/librispeech/manifest' \
+--full_download='True' \
+--target_dir='~/.cache/paddle/dataset/speech/Libri'
+
+if [ $? -ne 0 ]; then
+    echo "Prepare LibriSpeech failed. Terminated."
+    exit 1
+fi
+
+#cat data/librispeech/manifest.train* | shuf > data/librispeech/manifest.train
+
+# compute mean and stddev for normalizer
+python tools/compute_mean_std.py \
+--manifest_path='data/librispeech/manifest.train' \
+--num_samples=2000 \
+--specgram_type='linear' \
+--output_path='data/librispeech/mean_std.npz'
+
+if [ $? -ne 0 ]; then
+    echo "Compute mean and stddev failed. Terminated."
+    exit 1
+fi
+
+echo "LibriSpeech Data preparation done."

@@ -0,0 +1,28 @@
+#! /usr/bin/bash
+
+pushd ../..
+
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+python -u evaluate.py \
+--batch_size=128 \
+--trainer_count=8 \
+--beam_size=500 \
+--num_proc_bsearch=12 \
+--num_proc_data=12 \
+--num_conv_layers=2 \
+--num_rnn_layers=3 \
+--rnn_layer_size=2048 \
+--alpha=0.36 \
+--beta=0.25 \
+--cutoff_prob=0.99 \
+--use_gru=False \
+--use_gpu=True \
+--share_rnn_weights=True \
+--test_manifest='data/librispeech/manifest.test-clean' \
+--mean_std_path='data/librispeech/mean_std.npz' \
+--vocab_path='data/librispeech/eng_vocab.txt' \
+--model_path='checkpoints/params.latest.tar.gz' \
+--lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \
+--decoding_method='ctc_beam_search' \
+--error_rate_type='wer' \
+--specgram_type='linear'

@@ -0,0 +1,30 @@
+#! /usr/bin/bash
+
+pushd ../..
+
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+python -u train.py \
+--batch_size=256 \
+--trainer_count=8 \
+--num_passes=200 \
+--num_proc_data=12 \
+--num_conv_layers=2 \
+--num_rnn_layers=3 \
+--rnn_layer_size=2048 \
+--num_iter_print=100 \
+--learning_rate=5e-4 \
+--max_duration=27.0 \
+--min_duration=0.0 \
+--use_sortagrad=True \
+--use_gru=False \
+--use_gpu=True \
+--is_local=True \
+--share_rnn_weights=True \
+--train_manifest='data/librispeech/manifest.train' \
+--dev_manifest='data/librispeech/manifest.dev' \
+--mean_std_path='data/librispeech/mean_std.npz' \
+--vocab_path='data/librispeech/eng_vocab.txt' \
+--output_model_dir='./checkpoints' \
+--augment_conf_path='conf/augmentation.config' \
+--specgram_type='linear' \
+--shuffle_method='batch_shuffle_clipped'

@@ -0,0 +1,30 @@
+#! /usr/bin/bash
+
+pushd ../..
+
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+python -u tools/tune.py \
+--num_samples=100 \
+--trainer_count=8 \
+--beam_size=500 \
+--num_proc_bsearch=12 \
+--num_conv_layers=2 \
+--num_rnn_layers=3 \
+--rnn_layer_size=2048 \
+--num_alphas=14 \
+--num_betas=20 \
+--alpha_from=0.1 \
+--alpha_to=0.36 \
+--beta_from=0.05 \
+--beta_to=1.0 \
+--cutoff_prob=0.99 \
+--use_gru=False \
+--use_gpu=True \
+--share_rnn_weights=True \
+--tune_manifest='data/librispeech/manifest.dev-clean' \
+--mean_std_path='data/librispeech/mean_std.npz' \
+--vocab_path='data/librispeech/eng_vocab.txt' \
+--model_path='checkpoints/params.latest.tar.gz' \
+--lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \
+--error_rate_type='wer' \
+--specgram_type='linear'
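
`tools/tune.py` itself is not shown in this commit, but from the flags above it sweeps the language-model weight `alpha` and the word-count weight `beta`, re-decoding the tune set for each pair and keeping the lowest error rate. The grid implied by the flags:

```python
import numpy as np

# 14 x 20 = 280 (alpha, beta) candidates, per the flags above
params_grid = [(alpha, beta)
               for alpha in np.linspace(0.1, 0.36, num=14)   # --alpha_from/to
               for beta in np.linspace(0.05, 1.0, num=20)]   # --beta_from/to
```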

@@ -4,134 +4,72 @@ from __future__ import division
 from __future__ import print_function
 
 import argparse
-import distutils.util
-import multiprocessing
+import functools
 import paddle.v2 as paddle
 from data_utils.data import DataGenerator
-from model import DeepSpeech2Model
-from error_rate import wer, cer
-import utils
+from models.model import DeepSpeech2Model
+from utils.error_rate import wer, cer
+from utils.utility import add_arguments, print_arguments
 
 parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
-    "--num_samples",
-    default=10,
-    type=int,
-    help="Number of samples for inference. (default: %(default)s)")
-parser.add_argument(
-    "--num_conv_layers",
-    default=2,
-    type=int,
-    help="Convolution layer number. (default: %(default)s)")
-parser.add_argument(
-    "--num_rnn_layers",
-    default=3,
-    type=int,
-    help="RNN layer number. (default: %(default)s)")
-parser.add_argument(
-    "--rnn_layer_size",
-    default=512,
-    type=int,
-    help="RNN layer cell number. (default: %(default)s)")
-parser.add_argument(
-    "--use_gpu",
-    default=True,
-    type=distutils.util.strtobool,
-    help="Use gpu or not. (default: %(default)s)")
-parser.add_argument(
-    "--num_threads_data",
-    default=1,
-    type=int,
-    help="Number of cpu threads for preprocessing data. (default: %(default)s)")
-parser.add_argument(
-    "--num_processes_beam_search",
-    default=multiprocessing.cpu_count() // 2,
-    type=int,
-    help="Number of cpu processes for beam search. (default: %(default)s)")
-parser.add_argument(
-    "--specgram_type",
-    default='linear',
-    type=str,
-    help="Feature type of audio data: 'linear' (power spectrum)"
-    " or 'mfcc'. (default: %(default)s)")
-parser.add_argument(
-    "--trainer_count",
-    default=8,
-    type=int,
-    help="Trainer number. (default: %(default)s)")
-parser.add_argument(
-    "--mean_std_filepath",
-    default='mean_std.npz',
-    type=str,
-    help="Manifest path for normalizer. (default: %(default)s)")
-parser.add_argument(
-    "--decode_manifest_path",
-    default='datasets/manifest.test',
-    type=str,
-    help="Manifest path for decoding. (default: %(default)s)")
-parser.add_argument(
-    "--model_filepath",
-    default='checkpoints/params.latest.tar.gz',
-    type=str,
-    help="Model filepath. (default: %(default)s)")
-parser.add_argument(
-    "--vocab_filepath",
-    default='datasets/vocab/eng_vocab.txt',
-    type=str,
-    help="Vocabulary filepath. (default: %(default)s)")
-parser.add_argument(
-    "--decode_method",
-    default='beam_search',
-    type=str,
-    help="Method for ctc decoding: best_path or beam_search. "
-    "(default: %(default)s)")
-parser.add_argument(
-    "--beam_size",
-    default=500,
-    type=int,
-    help="Width for beam search decoding. (default: %(default)d)")
-parser.add_argument(
-    "--language_model_path",
-    default="lm/data/common_crawl_00.prune01111.trie.klm",
-    type=str,
-    help="Path for language model. (default: %(default)s)")
-parser.add_argument(
-    "--alpha",
-    default=0.36,
-    type=float,
-    help="Parameter associated with language model. (default: %(default)f)")
-parser.add_argument(
-    "--beta",
-    default=0.25,
-    type=float,
-    help="Parameter associated with word count. (default: %(default)f)")
-parser.add_argument(
-    "--cutoff_prob",
-    default=0.99,
-    type=float,
-    help="The cutoff probability of pruning"
-    "in beam search. (default: %(default)f)")
-parser.add_argument(
-    "--error_rate_type",
-    default='wer',
-    choices=['wer', 'cer'],
-    type=str,
-    help="Error rate type for evaluation. 'wer' for word error rate and 'cer' "
-    "for character error rate. "
-    "(default: %(default)s)")
+add_arg = functools.partial(add_arguments, argparser=parser)
+# yapf: disable
+add_arg('num_samples',      int,    10,     "# of samples to infer.")
+add_arg('trainer_count',    int,    8,      "# of Trainers (CPUs or GPUs).")
+add_arg('beam_size',        int,    500,    "Beam search width.")
+add_arg('num_proc_bsearch', int,    12,     "# of CPUs for beam search.")
+add_arg('num_conv_layers',  int,    2,      "# of convolution layers.")
+add_arg('num_rnn_layers',   int,    3,      "# of recurrent layers.")
+add_arg('rnn_layer_size',   int,    2048,   "# of recurrent cells per layer.")
+add_arg('alpha',            float,  0.36,   "Coef of LM for beam search.")
+add_arg('beta',             float,  0.25,   "Coef of WC for beam search.")
+add_arg('cutoff_prob',      float,  0.99,   "Cutoff probability for pruning.")
+add_arg('use_gru',          bool,   False,  "Use GRUs instead of simple RNNs.")
+add_arg('use_gpu',          bool,   True,   "Use GPU or not.")
+add_arg('share_rnn_weights',bool,   True,   "Share input-hidden weights across "
+                                            "bi-directional RNNs. Not for GRU.")
+add_arg('infer_manifest',   str,
+        'data/librispeech/manifest.dev-clean',
+        "Filepath of manifest to infer.")
+add_arg('mean_std_path',    str,
+        'data/librispeech/mean_std.npz',
+        "Filepath of normalizer's mean & std.")
+add_arg('vocab_path',       str,
+        'data/librispeech/eng_vocab.txt',
+        "Filepath of vocabulary.")
+add_arg('lang_model_path',  str,
+        'lm/data/common_crawl_00.prune01111.trie.klm',
+        "Filepath for language model.")
+add_arg('model_path',       str,
+        './checkpoints/params.latest.tar.gz',
+        "If None, the training starts from scratch, "
+        "otherwise, it resumes from the pre-trained model.")
+add_arg('decoding_method',  str,
+        'ctc_beam_search',
+        "Decoding method. Options: ctc_beam_search, ctc_greedy",
+        choices = ['ctc_beam_search', 'ctc_greedy'])
+add_arg('error_rate_type',  str,
+        'wer',
+        "Error rate type for evaluation.",
+        choices=['wer', 'cer'])
+add_arg('specgram_type',    str,
+        'linear',
+        "Audio feature type. Options: linear, mfcc.",
+        choices=['linear', 'mfcc'])
+# yapf: disable
 args = parser.parse_args()
 
 
 def infer():
     """Inference for DeepSpeech2."""
     data_generator = DataGenerator(
-        vocab_filepath=args.vocab_filepath,
-        mean_std_filepath=args.mean_std_filepath,
+        vocab_filepath=args.vocab_path,
+        mean_std_filepath=args.mean_std_path,
         augmentation_config='{}',
         specgram_type=args.specgram_type,
-        num_threads=args.num_threads_data)
+        num_threads=1)
     batch_reader = data_generator.batch_reader_creator(
-        manifest_path=args.decode_manifest_path,
+        manifest_path=args.infer_manifest,
         batch_size=args.num_samples,
         min_batch_size=1,
         sortagrad=False,
@@ -143,17 +81,19 @@ def infer():
         num_conv_layers=args.num_conv_layers,
         num_rnn_layers=args.num_rnn_layers,
         rnn_layer_size=args.rnn_layer_size,
-        pretrained_model_path=args.model_filepath)
+        use_gru=args.use_gru,
+        pretrained_model_path=args.model_path,
+        share_rnn_weights=args.share_rnn_weights)
 
     result_transcripts = ds2_model.infer_batch(
         infer_data=infer_data,
-        decode_method=args.decode_method,
+        decoding_method=args.decoding_method,
        beam_alpha=args.alpha,
        beam_beta=args.beta,
        beam_size=args.beam_size,
        cutoff_prob=args.cutoff_prob,
        vocab_list=data_generator.vocab_list,
-        language_model_path=args.language_model_path,
-        num_processes=args.num_processes_beam_search)
+        language_model_path=args.lang_model_path,
+        num_processes=args.num_proc_bsearch)
 
     error_rate_func = cer if args.error_rate_type == 'cer' else wer
     target_transcripts = [
@@ -168,7 +108,7 @@ def infer():
 
 def main():
-    utils.print_arguments(args)
+    print_arguments(args)
     paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
     infer()

@@ -1,177 +0,0 @@
-"""Contains DeepSpeech2 layers."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import paddle.v2 as paddle
-
-
-def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
-                  padding, act):
-    """Convolution layer with batch normalization.
-
-    :param input: Input layer.
-    :type input: LayerOutput
-    :param filter_size: The x dimension of a filter kernel. Or input a tuple for
-                        two image dimension.
-    :type filter_size: int|tuple|list
-    :param num_channels_in: Number of input channels.
-    :type num_channels_in: int
-    :type num_channels_out: Number of output channels.
-    :type num_channels_in: out
-    :param padding: The x dimension of the padding. Or input a tuple for two
-                    image dimension.
-    :type padding: int|tuple|list
-    :param act: Activation type.
-    :type act: BaseActivation
-    :return: Batch norm layer after convolution layer.
-    :rtype: LayerOutput
-    """
-    conv_layer = paddle.layer.img_conv(
-        input=input,
-        filter_size=filter_size,
-        num_channels=num_channels_in,
-        num_filters=num_channels_out,
-        stride=stride,
-        padding=padding,
-        act=paddle.activation.Linear(),
-        bias_attr=False)
-    return paddle.layer.batch_norm(input=conv_layer, act=act)
-
-
-def bidirectional_simple_rnn_bn_layer(name, input, size, act):
-    """Bidirectonal simple rnn layer with sequence-wise batch normalization.
-    The batch normalization is only performed on input-state weights.
-
-    :param name: Name of the layer.
-    :type name: string
-    :param input: Input layer.
-    :type input: LayerOutput
-    :param size: Number of RNN cells.
-    :type size: int
-    :param act: Activation type.
-    :type act: BaseActivation
-    :return: Bidirectional simple rnn layer.
-    :rtype: LayerOutput
-    """
-    # input-hidden weights shared across bi-direcitonal rnn.
-    input_proj = paddle.layer.fc(
-        input=input, size=size, act=paddle.activation.Linear(), bias_attr=False)
-    # batch norm is only performed on input-state projection
-    input_proj_bn = paddle.layer.batch_norm(
-        input=input_proj, act=paddle.activation.Linear())
-    # forward and backward in time
-    forward_simple_rnn = paddle.layer.recurrent(
-        input=input_proj_bn, act=act, reverse=False)
-    backward_simple_rnn = paddle.layer.recurrent(
-        input=input_proj_bn, act=act, reverse=True)
-    return paddle.layer.concat(input=[forward_simple_rnn, backward_simple_rnn])
-
-
-def conv_group(input, num_stacks):
-    """Convolution group with stacked convolution layers.
-
-    :param input: Input layer.
-    :type input: LayerOutput
-    :param num_stacks: Number of stacked convolution layers.
-    :type num_stacks: int
-    :return: Output layer of the convolution group.
-    :rtype: LayerOutput
-    """
-    conv = conv_bn_layer(
-        input=input,
-        filter_size=(11, 41),
-        num_channels_in=1,
-        num_channels_out=32,
-        stride=(3, 2),
-        padding=(5, 20),
-        act=paddle.activation.BRelu())
-    for i in xrange(num_stacks - 1):
-        conv = conv_bn_layer(
-            input=conv,
-            filter_size=(11, 21),
-            num_channels_in=32,
-            num_channels_out=32,
-            stride=(1, 2),
-            padding=(5, 10),
-            act=paddle.activation.BRelu())
-    output_num_channels = 32
-    output_height = 160 // pow(2, num_stacks) + 1
-    return conv, output_num_channels, output_height
-
-
-def rnn_group(input, size, num_stacks):
-    """RNN group with stacked bidirectional simple RNN layers.
-
-    :param input: Input layer.
-    :type input: LayerOutput
-    :param size: Number of RNN cells in each layer.
-    :type size: int
-    :param num_stacks: Number of stacked rnn layers.
-    :type num_stacks: int
-    :return: Output layer of the RNN group.
-    :rtype: LayerOutput
-    """
-    output = input
-    for i in xrange(num_stacks):
-        output = bidirectional_simple_rnn_bn_layer(
-            name=str(i), input=output, size=size, act=paddle.activation.BRelu())
-    return output
-
-
-def deep_speech2(audio_data,
-                 text_data,
-                 dict_size,
-                 num_conv_layers=2,
-                 num_rnn_layers=3,
-                 rnn_size=256):
-    """
-    The whole DeepSpeech2 model structure (a simplified version).
-
-    :param audio_data: Audio spectrogram data layer.
-    :type audio_data: LayerOutput
-    :param text_data: Transcription text data layer.
-    :type text_data: LayerOutput
-    :param dict_size: Dictionary size for tokenized transcription.
-    :type dict_size: int
-    :param num_conv_layers: Number of stacking convolution layers.
-    :type num_conv_layers: int
-    :param num_rnn_layers: Number of stacking RNN layers.
-    :type num_rnn_layers: int
-    :param rnn_size: RNN layer size (number of RNN cells).
-    :type rnn_size: int
-    :return: A tuple of an output unnormalized log probability layer (
-             before softmax) and a ctc cost layer.
-    :rtype: tuple of LayerOutput
-    """
-    # convolution group
-    conv_group_output, conv_group_num_channels, conv_group_height = conv_group(
-        input=audio_data, num_stacks=num_conv_layers)
-    # convert data form convolution feature map to sequence of vectors
-    conv2seq = paddle.layer.block_expand(
-        input=conv_group_output,
-        num_channels=conv_group_num_channels,
-        stride_x=1,
-        stride_y=1,
-        block_x=1,
-        block_y=conv_group_height)
-    # rnn group
-    rnn_group_output = rnn_group(
-        input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers)
-    fc = paddle.layer.fc(
-        input=rnn_group_output,
-        size=dict_size + 1,
-        act=paddle.activation.Linear(),
-        bias_attr=True)
-    # probability distribution with softmax
-    log_probs = paddle.layer.mixed(
-        input=paddle.layer.identity_projection(input=fc),
-        act=paddle.activation.Softmax())
-    # ctc cost
-    ctc_loss = paddle.layer.warp_ctc(
-        input=fc,
-        label=text_data,
-        size=dict_size + 1,
-        blank=dict_size,
-        norm_by_times=True)
-    return log_probs, ctc_loss

@@ -9,8 +9,9 @@ from math import log
 import multiprocessing
 
-def ctc_best_path_decoder(probs_seq, vocabulary):
-    """Best path decoder, also called argmax decoder or greedy decoder.
+def ctc_greedy_decoder(probs_seq, vocabulary):
+    """CTC greedy (best path) decoder.
+
     Path consisting of the most probable tokens are further post-processed to
     remove consecutive repetitions and all blanks.
@@ -45,10 +46,12 @@ def ctc_beam_search_decoder(probs_seq,
                             cutoff_prob=1.0,
                             ext_scoring_func=None,
                             nproc=False):
-    """Beam search decoder for CTC-trained network. It utilizes beam search
-    to approximately select top best decoding labels and returning results
-    in the descending order. The implementation is based on Prefix
-    Beam Search (https://arxiv.org/abs/1408.2873), and the unclear part is
+    """CTC Beam search decoder.
+
+    It utilizes beam search to approximately select top best decoding
+    labels and returning results in the descending order.
+    The implementation is based on Prefix Beam Search
+    (https://arxiv.org/abs/1408.2873), and the unclear part is
     redesigned. Two important modifications: 1) in the iterative computation
     of probabilities, the assignment operation is changed to accumulation for
     one prefix may comes from different paths; 2) the if condition "if l^+ not
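
For context on the rename: a CTC greedy (best path) decoder takes the argmax token per frame, collapses consecutive repeats, then removes blanks. A minimal sketch consistent with the docstring (the repo's implementation may differ in detail):

```python
import numpy as np

def ctc_greedy_decoder(probs_seq, vocabulary):
    """Argmax per frame, collapse repeats, drop blanks."""
    blank_index = len(vocabulary)  # blank is the last class, as in the network's CTC setup
    best_path = np.argmax(probs_seq, axis=1)
    result, prev = [], None
    for idx in best_path:
        if idx != prev and idx != blank_index:
            result.append(vocabulary[idx])
        prev = idx
    return ''.join(result)
```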

@@ -7,10 +7,10 @@ import sys
 import os
 import time
 import gzip
-from decoder import *
-from lm.lm_scorer import LmScorer
 import paddle.v2 as paddle
-from layer import *
+from lm.lm_scorer import LmScorer
+from models.decoder import ctc_greedy_decoder, ctc_beam_search_decoder
+from models.network import deep_speech_v2_network
 
 
 class DeepSpeech2Model(object):
@@ -27,12 +27,17 @@ class DeepSpeech2Model(object):
     :param pretrained_model_path: Pretrained model path. If None, will train
                                   from stratch.
     :type pretrained_model_path: basestring|None
+    :param share_rnn_weights: Whether to share input-hidden weights between
+                              forward and backward directional RNNs.Notice that
+                              for GRU, weight sharing is not supported.
+    :type share_rnn_weights: bool
     """
 
     def __init__(self, vocab_size, num_conv_layers, num_rnn_layers,
-                 rnn_layer_size, pretrained_model_path):
+                 rnn_layer_size, use_gru, pretrained_model_path,
+                 share_rnn_weights):
         self._create_network(vocab_size, num_conv_layers, num_rnn_layers,
-                             rnn_layer_size)
+                             rnn_layer_size, use_gru, share_rnn_weights)
         self._create_parameters(pretrained_model_path)
         self._inferer = None
         self._loss_inferer = None
@@ -141,7 +146,7 @@ class DeepSpeech2Model(object):
         # run inference
         return self._loss_inferer.infer(input=infer_data)
 
-    def infer_batch(self, infer_data, decode_method, beam_alpha, beam_beta,
+    def infer_batch(self, infer_data, decoding_method, beam_alpha, beam_beta,
                     beam_size, cutoff_prob, vocab_list, language_model_path,
                     num_processes):
         """Model inference. Infer the transcription for a batch of speech
@@ -151,9 +156,9 @@ class DeepSpeech2Model(object):
                            consisting of a tuple of audio features and
                            transcription text (empty string).
         :type infer_data: list
-        :param decode_method: Decoding method name, 'best_path' or
-                              'beam search'.
-        :param decode_method: string
+        :param decoding_method: Decoding method name, 'ctc_greedy' or
+                                'ctc_beam_search'.
+        :param decoding_method: string
         :param beam_alpha: Parameter associated with language model.
         :type beam_alpha: float
         :param beam_beta: Parameter associated with word count.
@@ -185,13 +190,13 @@ class DeepSpeech2Model(object):
         ]
         # run decoder
         results = []
-        if decode_method == "best_path":
+        if decoding_method == "ctc_greedy":
             # best path decode
             for i, probs in enumerate(probs_split):
-                output_transcription = ctc_best_path_decoder(
+                output_transcription = ctc_greedy_decoder(
                     probs_seq=probs, vocabulary=vocab_list)
                 results.append(output_transcription)
-        elif decode_method == "beam_search":
+        elif decoding_method == "ctc_beam_search":
             # initialize external scorer
             if self._ext_scorer == None:
                 self._ext_scorer = LmScorer(beam_alpha, beam_beta,
@@ -200,7 +205,6 @@ class DeepSpeech2Model(object):
             else:
                 self._ext_scorer.reset_params(beam_alpha, beam_beta)
                 assert self._loaded_lm_path == language_model_path
-
             # beam search decode
             beam_search_results = ctc_beam_search_decoder_batch(
                 probs_split=probs_split,
@@ -214,7 +218,7 @@ class DeepSpeech2Model(object):
             results = [result[0][1] for result in beam_search_results]
         else:
             raise ValueError("Decoding method [%s] is not supported." %
-                             decode_method)
+                             decoding_method)
         return results
 
     def _create_parameters(self, model_path=None):
@@ -226,7 +230,7 @@ class DeepSpeech2Model(object):
                 gzip.open(model_path))
 
     def _create_network(self, vocab_size, num_conv_layers, num_rnn_layers,
-                        rnn_layer_size):
+                        rnn_layer_size, use_gru, share_rnn_weights):
         """Create data layers and model network."""
         # paddle.data_type.dense_array is used for variable batch input.
         # The size 161 * 161 is only an placeholder value and the real shape
@@ -237,10 +241,12 @@ class DeepSpeech2Model(object):
         text_data = paddle.layer.data(
             name="transcript_text",
             type=paddle.data_type.integer_value_sequence(vocab_size))
-        self._log_probs, self._loss = deep_speech2(
+        self._log_probs, self._loss = deep_speech_v2_network(
             audio_data=audio_data,
             text_data=text_data,
             dict_size=vocab_size,
             num_conv_layers=num_conv_layers,
             num_rnn_layers=num_rnn_layers,
-            rnn_size=rnn_layer_size)
+            rnn_size=rnn_layer_size,
+            use_gru=use_gru,
+            share_rnn_weights=share_rnn_weights)
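
Putting the new signature together, construction now looks like this (mirrors the call sites in `infer.py` and `demo_server.py` in this commit; the paths are the defaults from those scripts):

```python
from data_utils.data import DataGenerator
from models.model import DeepSpeech2Model

data_generator = DataGenerator(
    vocab_filepath='data/librispeech/eng_vocab.txt',
    mean_std_filepath='data/librispeech/mean_std.npz',
    augmentation_config='{}',
    specgram_type='linear',
    num_threads=1)
ds2_model = DeepSpeech2Model(
    vocab_size=data_generator.vocab_size,
    num_conv_layers=2,
    num_rnn_layers=3,
    rnn_layer_size=2048,
    use_gru=False,
    pretrained_model_path='./checkpoints/params.latest.tar.gz',
    share_rnn_weights=True)  # only honored on the simple-RNN path, not for GRU
```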

@@ -0,0 +1,274 @@
+"""Contains DeepSpeech2 layers and networks."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle.v2 as paddle
+
+
+def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
+                  padding, act):
+    """Convolution layer with batch normalization.
+
+    :param input: Input layer.
+    :type input: LayerOutput
+    :param filter_size: The x dimension of a filter kernel. Or input a tuple for
+                        two image dimension.
+    :type filter_size: int|tuple|list
+    :param num_channels_in: Number of input channels.
+    :type num_channels_in: int
+    :type num_channels_out: Number of output channels.
+    :type num_channels_in: out
+    :param padding: The x dimension of the padding. Or input a tuple for two
+                    image dimension.
+    :type padding: int|tuple|list
+    :param act: Activation type.
+    :type act: BaseActivation
+    :return: Batch norm layer after convolution layer.
+    :rtype: LayerOutput
+    """
+    conv_layer = paddle.layer.img_conv(
+        input=input,
+        filter_size=filter_size,
+        num_channels=num_channels_in,
+        num_filters=num_channels_out,
+        stride=stride,
+        padding=padding,
+        act=paddle.activation.Linear(),
+        bias_attr=False)
+    return paddle.layer.batch_norm(input=conv_layer, act=act)
+
+
+def bidirectional_simple_rnn_bn_layer(name, input, size, act, share_weights):
+    """Bidirectonal simple rnn layer with sequence-wise batch normalization.
+    The batch normalization is only performed on input-state weights.
+
+    :param name: Name of the layer.
+    :type name: string
+    :param input: Input layer.
+    :type input: LayerOutput
+    :param size: Number of RNN cells.
+    :type size: int
+    :param act: Activation type.
+    :type act: BaseActivation
+    :param share_weights: Whether to share input-hidden weights between
+                          forward and backward directional RNNs.
+    :type share_weights: bool
+    :return: Bidirectional simple rnn layer.
+    :rtype: LayerOutput
+    """
+    if share_weights:
+        # input-hidden weights shared between bi-direcitonal rnn.
+        input_proj = paddle.layer.fc(
+            input=input,
+            size=size,
+            act=paddle.activation.Linear(),
+            bias_attr=False)
+        # batch norm is only performed on input-state projection
+        input_proj_bn = paddle.layer.batch_norm(
+            input=input_proj, act=paddle.activation.Linear())
+        # forward and backward in time
+        forward_simple_rnn = paddle.layer.recurrent(
+            input=input_proj_bn, act=act, reverse=False)
+        backward_simple_rnn = paddle.layer.recurrent(
+            input=input_proj_bn, act=act, reverse=True)
+    else:
+        input_proj_forward = paddle.layer.fc(
+            input=input,
+            size=size,
+            act=paddle.activation.Linear(),
+            bias_attr=False)
+        input_proj_backward = paddle.layer.fc(
+            input=input,
+            size=size,
+            act=paddle.activation.Linear(),
+            bias_attr=False)
+        # batch norm is only performed on input-state projection
+        input_proj_bn_forward = paddle.layer.batch_norm(
+            input=input_proj_forward, act=paddle.activation.Linear())
+        input_proj_bn_backward = paddle.layer.batch_norm(
+            input=input_proj_backward, act=paddle.activation.Linear())
+        # forward and backward in time
+        forward_simple_rnn = paddle.layer.recurrent(
+            input=input_proj_bn_forward, act=act, reverse=False)
+        backward_simple_rnn = paddle.layer.recurrent(
+            input=input_proj_bn_backward, act=act, reverse=True)
+
+    return paddle.layer.concat(input=[forward_simple_rnn, backward_simple_rnn])
+
+
+def bidirectional_gru_bn_layer(name, input, size, act):
+    """Bidirectonal gru layer with sequence-wise batch normalization.
+    The batch normalization is only performed on input-state weights.
+
+    :param name: Name of the layer.
+    :type name: string
+    :param input: Input layer.
+    :type input: LayerOutput
+    :param size: Number of RNN cells.
+    :type size: int
+    :param act: Activation type.
+    :type act: BaseActivation
+    :return: Bidirectional simple rnn layer.
+    :rtype: LayerOutput
+    """
+    input_proj_forward = paddle.layer.fc(
+        input=input,
+        size=size * 3,
+        act=paddle.activation.Linear(),
+        bias_attr=False)
+    input_proj_backward = paddle.layer.fc(
+        input=input,
+        size=size * 3,
+        act=paddle.activation.Linear(),
+        bias_attr=False)
+    # batch norm is only performed on input-related projections
+    input_proj_bn_forward = paddle.layer.batch_norm(
+        input=input_proj_forward, act=paddle.activation.Linear())
+    input_proj_bn_backward = paddle.layer.batch_norm(
+        input=input_proj_backward, act=paddle.activation.Linear())
+    # forward and backward in time
+    forward_gru = paddle.layer.grumemory(
+        input=input_proj_bn_forward, act=act, reverse=False)
+    backward_gru = paddle.layer.grumemory(
+        input=input_proj_bn_backward, act=act, reverse=True)
+    return paddle.layer.concat(input=[forward_gru, backward_gru])
+
+
+def conv_group(input, num_stacks):
+    """Convolution group with stacked convolution layers.
+
+    :param input: Input layer.
+    :type input: LayerOutput
+    :param num_stacks: Number of stacked convolution layers.
+    :type num_stacks: int
+    :return: Output layer of the convolution group.
+    :rtype: LayerOutput
+    """
+    conv = conv_bn_layer(
+        input=input,
+        filter_size=(11, 41),
+        num_channels_in=1,
+        num_channels_out=32,
+        stride=(3, 2),
+        padding=(5, 20),
+        act=paddle.activation.BRelu())
+    for i in xrange(num_stacks - 1):
+        conv = conv_bn_layer(
+            input=conv,
+            filter_size=(11, 21),
+            num_channels_in=32,
+            num_channels_out=32,
+            stride=(1, 2),
+            padding=(5, 10),
+            act=paddle.activation.BRelu())
+    output_num_channels = 32
+    output_height = 160 // pow(2, num_stacks) + 1
+    return conv, output_num_channels, output_height
+
+
+def rnn_group(input, size, num_stacks, use_gru, share_rnn_weights):
+    """RNN group with stacked bidirectional simple RNN layers.
+
+    :param input: Input layer.
+    :type input: LayerOutput
+    :param size: Number of RNN cells in each layer.
+    :type size: int
+    :param num_stacks: Number of stacked rnn layers.
+    :type num_stacks: int
+    :param use_gru: Use gru if set True. Use simple rnn if set False.
+    :type use_gru: bool
+    :param share_rnn_weights: Whether to share input-hidden weights between
+                              forward and backward directional RNNs.
+                              It is only available when use_gru=False.
+    :type share_weights: bool
+    :return: Output layer of the RNN group.
+    :rtype: LayerOutput
+    """
+    output = input
+    for i in xrange(num_stacks):
+        if use_gru:
+            output = bidirectional_gru_bn_layer(
+                name=str(i),
+                input=output,
+                size=size,
+                act=paddle.activation.Relu())
+            # BRelu does not support hppl, need to add later. Use Relu instead.
+        else:
+            output = bidirectional_simple_rnn_bn_layer(
+                name=str(i),
+                input=output,
+                size=size,
+                act=paddle.activation.BRelu(),
+                share_weights=share_rnn_weights)
+    return output
+
+
+def deep_speech_v2_network(audio_data,
+                           text_data,
+                           dict_size,
+                           num_conv_layers=2,
+                           num_rnn_layers=3,
+                           rnn_size=256,
+                           use_gru=False,
+                           share_rnn_weights=True):
+    """The DeepSpeech2 network structure.
+
+    :param audio_data: Audio spectrogram data layer.
+    :type audio_data: LayerOutput
+    :param text_data: Transcription text data layer.
+    :type text_data: LayerOutput
+    :param dict_size: Dictionary size for tokenized transcription.
+    :type dict_size: int
+    :param num_conv_layers: Number of stacking convolution layers.
+    :type num_conv_layers: int
+    :param num_rnn_layers: Number of stacking RNN layers.
+    :type num_rnn_layers: int
+    :param rnn_size: RNN layer size (number of RNN cells).
+    :type rnn_size: int
+    :param use_gru: Use gru if set True. Use simple rnn if set False.
+    :type use_gru: bool
+    :param share_rnn_weights: Whether to share input-hidden weights between
+                              forward and backward direction RNNs.
+                              It is only available when use_gru=False.
+    :type share_weights: bool
+    :return: A tuple of an output unnormalized log probability layer (
+             before softmax) and a ctc cost layer.
+    :rtype: tuple of LayerOutput
+    """
+    # convolution group
+    conv_group_output, conv_group_num_channels, conv_group_height = conv_group(
+        input=audio_data, num_stacks=num_conv_layers)
+    # convert data form convolution feature map to sequence of vectors
+    conv2seq = paddle.layer.block_expand(
+        input=conv_group_output,
+        num_channels=conv_group_num_channels,
+        stride_x=1,
+        stride_y=1,
+        block_x=1,
+        block_y=conv_group_height)
+    # rnn group
+    rnn_group_output = rnn_group(
+        input=conv2seq,
+        size=rnn_size,
+        num_stacks=num_rnn_layers,
+        use_gru=use_gru,
+        share_rnn_weights=share_rnn_weights)
+    fc = paddle.layer.fc(
+        input=rnn_group_output,
+        size=dict_size + 1,
+        act=paddle.activation.Linear(),
+        bias_attr=True)
+    # probability distribution with softmax
+    log_probs = paddle.layer.mixed(
+        input=paddle.layer.identity_projection(input=fc),
+        act=paddle.activation.Softmax())
+    # ctc cost
+    ctc_loss = paddle.layer.warp_ctc(
+        input=fc,
+        label=text_data,
+        size=dict_size + 1,
+        blank=dict_size,
+        norm_by_times=True)
+    return log_probs, ctc_loss
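
A quick check on `conv_group`'s height bookkeeping: the first conv layer strides the 161-bin frequency axis by 2 and each extra stacked layer halves it again, which the `160 // pow(2, num_stacks) + 1` formula tracks:

```python
# spectrogram height after the conv stack, per conv_group() above
for num_stacks in (1, 2, 3):
    print(num_stacks, 160 // pow(2, num_stacks) + 1)  # -> 81, 41, 21
```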

@@ -4,7 +4,7 @@ from __future__ import division
 from __future__ import print_function

 import unittest
-from decoder import *
+from models import decoder


 class TestDecoders(unittest.TestCase):
@@ -49,19 +49,21 @@ class TestDecoders(unittest.TestCase):
             0.15882358, 0.1235788, 0.23376776, 0.20510435, 0.00279306,
             0.05294827, 0.22298418
         ]]
-        self.best_path_result = ["ac'bdc", "b'da"]
+        self.greedy_result = ["ac'bdc", "b'da"]
         self.beam_search_result = ['acdc', "b'a"]

-    def test_best_path_decoder_1(self):
-        bst_result = ctc_best_path_decoder(self.probs_seq1, self.vocab_list)
-        self.assertEqual(bst_result, self.best_path_result[0])
+    def test_greedy_decoder_1(self):
+        bst_result = decoder.ctc_greedy_decoder(self.probs_seq1,
+                                                self.vocab_list)
+        self.assertEqual(bst_result, self.greedy_result[0])

-    def test_best_path_decoder_2(self):
-        bst_result = ctc_best_path_decoder(self.probs_seq2, self.vocab_list)
-        self.assertEqual(bst_result, self.best_path_result[1])
+    def test_greedy_decoder_2(self):
+        bst_result = decoder.ctc_greedy_decoder(self.probs_seq2,
+                                                self.vocab_list)
+        self.assertEqual(bst_result, self.greedy_result[1])

     def test_beam_search_decoder_1(self):
-        beam_result = ctc_beam_search_decoder(
+        beam_result = decoder.ctc_beam_search_decoder(
             probs_seq=self.probs_seq1,
             beam_size=self.beam_size,
             vocabulary=self.vocab_list,
@@ -69,7 +71,7 @@ class TestDecoders(unittest.TestCase):
         self.assertEqual(beam_result[0][1], self.beam_search_result[0])

     def test_beam_search_decoder_2(self):
-        beam_result = ctc_beam_search_decoder(
+        beam_result = decoder.ctc_beam_search_decoder(
             probs_seq=self.probs_seq2,
             beam_size=self.beam_size,
             vocabulary=self.vocab_list,
@@ -77,7 +79,7 @@ class TestDecoders(unittest.TestCase):
         self.assertEqual(beam_result[0][1], self.beam_search_result[1])

     def test_beam_search_decoder_batch(self):
-        beam_results = ctc_beam_search_decoder_batch(
+        beam_results = decoder.ctc_beam_search_decoder_batch(
             probs_split=[self.probs_seq1, self.probs_seq2],
             beam_size=self.beam_size,
             vocabulary=self.vocab_list,
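As a quick illustration of what the renamed tests exercise: ctc_greedy_decoder consumes per-frame probability rows of length len(vocabulary) + 1, takes the argmax per frame, collapses repeats, and drops blanks. A hedged sketch with made-up probabilities, assuming the blank is the last index (consistent with blank=dict_size in the network code above):

from models import decoder

vocab = ['a', 'b', 'c']
probs = [[0.1, 0.7, 0.1, 0.1],  # argmax -> 'b'
         [0.1, 0.7, 0.1, 0.1],  # repeated 'b', collapsed
         [0.1, 0.1, 0.1, 0.7]]  # argmax -> blank, dropped
print(decoder.ctc_greedy_decoder(probs, vocab))  # 'b'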

@@ -0,0 +1,121 @@
"""Evaluation for DeepSpeech2 model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import functools
import paddle.v2 as paddle
from data_utils.data import DataGenerator
from models.model import DeepSpeech2Model
from utils.error_rate import wer, cer
from utils.utility import add_arguments, print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('batch_size', int, 128, "Minibatch size.")
add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).")
add_arg('beam_size', int, 500, "Beam search width.")
add_arg('num_proc_bsearch', int, 12, "# of CPUs for beam search.")
add_arg('num_proc_data', int, 12, "# of CPUs for data preprocessing.")
add_arg('num_conv_layers', int, 2, "# of convolution layers.")
add_arg('num_rnn_layers', int, 3, "# of recurrent layers.")
add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.")
add_arg('alpha', float, 0.36, "Coef of LM for beam search.")
add_arg('beta', float, 0.25, "Coef of WC for beam search.")
add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.")
add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.")
add_arg('use_gpu', bool, True, "Use GPU or not.")
add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across "
"bi-directional RNNs. Not for GRU.")
add_arg('test_manifest', str,
'data/librispeech/manifest.test-clean',
"Filepath of manifest to evaluate.")
add_arg('mean_std_path', str,
'data/librispeech/mean_std.npz',
"Filepath of normalizer's mean & std.")
add_arg('vocab_path', str,
'data/librispeech/eng_vocab.txt',
"Filepath of vocabulary.")
add_arg('model_path', str,
'./checkpoints/params.latest.tar.gz',
"Filepath of the trained model to evaluate.")
add_arg('lang_model_path', str,
'lm/data/common_crawl_00.prune01111.trie.klm',
"Filepath for language model.")
add_arg('decoding_method', str,
'ctc_beam_search',
"Decoding method. Options: ctc_beam_search, ctc_greedy",
choices=['ctc_beam_search', 'ctc_greedy'])
add_arg('error_rate_type', str,
'wer',
"Error rate type for evaluation.",
choices=['wer', 'cer'])
add_arg('specgram_type', str,
'linear',
"Audio feature type. Options: linear, mfcc.",
choices=['linear', 'mfcc'])
# yapf: enable
args = parser.parse_args()
def evaluate():
"""Evaluate on whole test data for DeepSpeech2."""
data_generator = DataGenerator(
vocab_filepath=args.vocab_path,
mean_std_filepath=args.mean_std_path,
augmentation_config='{}',
specgram_type=args.specgram_type,
num_threads=args.num_proc_data)
batch_reader = data_generator.batch_reader_creator(
manifest_path=args.test_manifest,
batch_size=args.batch_size,
min_batch_size=1,
sortagrad=False,
shuffle_method=None)
ds2_model = DeepSpeech2Model(
vocab_size=data_generator.vocab_size,
num_conv_layers=args.num_conv_layers,
num_rnn_layers=args.num_rnn_layers,
rnn_layer_size=args.rnn_layer_size,
use_gru=args.use_gru,
pretrained_model_path=args.model_path,
share_rnn_weights=args.share_rnn_weights)
error_rate_func = cer if args.error_rate_type == 'cer' else wer
error_sum, num_ins = 0.0, 0
for infer_data in batch_reader():
result_transcripts = ds2_model.infer_batch(
infer_data=infer_data,
decoding_method=args.decoding_method,
beam_alpha=args.alpha,
beam_beta=args.beta,
beam_size=args.beam_size,
cutoff_prob=args.cutoff_prob,
vocab_list=data_generator.vocab_list,
language_model_path=args.lang_model_path,
num_processes=args.num_proc_bsearch)
target_transcripts = [
''.join([data_generator.vocab_list[token] for token in transcript])
for _, transcript in infer_data
]
for target, result in zip(target_transcripts, result_transcripts):
error_sum += error_rate_func(target, result)
num_ins += 1
print("Error rate [%s] (%d/?) = %f" %
(args.error_rate_type, num_ins, error_sum / num_ins))
print("Final error rate [%s] (%d/%d) = %f" %
(args.error_rate_type, num_ins, num_ins, error_sum / num_ins))
def main():
print_arguments(args)
paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
evaluate()
if __name__ == '__main__':
main()
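Note that the reported figure is the mean of per-utterance error rates, weighting every utterance equally, rather than a corpus-level rate weighted by reference length. A small sketch of the same accumulation using the repo's wer helper:

from utils.error_rate import wer

targets = ['the cat sat', 'on the mat']
results = ['the cat sat', 'on a mat']
error_sum, num_ins = 0.0, 0
for target, result in zip(targets, results):
    error_sum += wer(target, result)  # per-utterance WER
    num_ins += 1
print(error_sum / num_ins)  # mean over utterances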

@@ -1,23 +0,0 @@
"""Test Setup."""
import unittest
import numpy as np
import os
class TestSetup(unittest.TestCase):
def test_soundfile(self):
import soundfile as sf
# floating point data is typically limited to the interval [-1.0, 1.0],
# but smaller/larger values are supported as well
data = np.array([[1.75, -1.75], [1.0, -1.0], [0.5, -0.5],
[0.25, -0.25]])
file = 'test.wav'
sf.write(file, data, 44100, format='WAV', subtype='FLOAT')
read, fs = sf.read(file)
self.assertTrue(np.all(read == data))
self.assertEqual(fs, 44100)
os.remove(file)
if __name__ == '__main__':
unittest.main()

@@ -7,32 +7,29 @@ from __future__ import division
 from __future__ import print_function

 import argparse
+import functools
 import codecs
 import json
 from collections import Counter
 import os.path
 import _init_paths
-from data_utils import utils
+from data_utils.utility import read_manifest
+from utils.utility import add_arguments, print_arguments

 parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
-    "--manifest_paths",
-    type=str,
-    help="Manifest paths for building vocabulary."
-    "You can provide multiple manifest files.",
-    nargs='+',
-    required=True)
-parser.add_argument(
-    "--count_threshold",
-    default=0,
-    type=int,
-    help="Characters whose counts are below the threshold will be truncated. "
-    "(default: %(default)i)")
-parser.add_argument(
-    "--vocab_path",
-    default='datasets/vocab/zh_vocab.txt',
-    type=str,
-    help="File path to write the vocabulary. (default: %(default)s)")
+add_arg = functools.partial(add_arguments, argparser=parser)
+# yapf: disable
+add_arg('count_threshold', int, 0, "Truncation threshold for char counts.")
+add_arg('vocab_path', str,
+        'datasets/vocab/zh_vocab.txt',
+        "Filepath to write the vocabulary.")
+add_arg('manifest_paths', str,
+        None,
+        "Filepaths of manifests for building vocabulary. "
+        "You can provide multiple manifest files.",
+        nargs='+',
+        required=True)
+# yapf: enable
 args = parser.parse_args()
@@ -44,6 +41,8 @@ def count_manifest(counter, manifest_path):

 def main():
+    print_arguments(args)
+
     counter = Counter()
     for manifest_path in args.manifest_paths:
         count_manifest(counter, manifest_path)

@@ -4,48 +4,35 @@ from __future__ import division
 from __future__ import print_function

 import argparse
+import functools
 import _init_paths
 from data_utils.normalizer import FeatureNormalizer
 from data_utils.augmentor.augmentation import AugmentationPipeline
 from data_utils.featurizer.audio_featurizer import AudioFeaturizer
+from utils.utility import add_arguments, print_arguments

-parser = argparse.ArgumentParser(
-    description='Computing mean and stddev for feature normalizer.')
-parser.add_argument(
-    "--specgram_type",
-    default='linear',
-    type=str,
-    help="Feature type of audio data: 'linear' (power spectrum)"
-    " or 'mfcc'. (default: %(default)s)")
-parser.add_argument(
-    "--manifest_path",
-    default='datasets/manifest.train',
-    type=str,
-    help="Manifest path for computing normalizer's mean and stddev."
-    "(default: %(default)s)")
-parser.add_argument(
-    "--num_samples",
-    default=2000,
-    type=int,
-    help="Number of samples for computing mean and stddev. "
-    "(default: %(default)s)")
-parser.add_argument(
-    "--augmentation_config",
-    default='{}',
-    type=str,
-    help="Augmentation configuration in json-format. "
-    "(default: %(default)s)")
-parser.add_argument(
-    "--output_file",
-    default='mean_std.npz',
-    type=str,
-    help="Filepath to write mean and std to (.npz)."
-    "(default: %(default)s)")
+parser = argparse.ArgumentParser(description=__doc__)
+add_arg = functools.partial(add_arguments, argparser=parser)
+# yapf: disable
+add_arg('num_samples', int, 2000, "# of samples for computing statistics.")
+add_arg('specgram_type', str,
+        'linear',
+        "Audio feature type. Options: linear, mfcc.",
+        choices=['linear', 'mfcc'])
+add_arg('manifest_path', str,
+        'datasets/manifest.train',
+        "Filepath of manifest to compute normalizer's mean and stddev.")
+add_arg('output_path', str,
+        'mean_std.npz',
+        "Filepath to write mean and stddev to (.npz).")
+# yapf: enable
 args = parser.parse_args()


 def main():
-    augmentation_pipeline = AugmentationPipeline(args.augmentation_config)
+    print_arguments(args)
+
+    augmentation_pipeline = AugmentationPipeline('{}')
     audio_featurizer = AudioFeaturizer(specgram_type=args.specgram_type)

     def augment_and_featurize(audio_segment):
@@ -57,7 +44,7 @@ def main():
         manifest_path=args.manifest_path,
         featurize_func=augment_and_featurize,
         num_samples=args.num_samples)
-    normalizer.write_to_file(args.output_file)
+    normalizer.write_to_file(args.output_path)

 if __name__ == '__main__':
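To sanity-check the result, the normalizer's output is a NumPy archive; assuming it stores 'mean' and 'std' arrays (an assumption about FeatureNormalizer's file format, not shown in this diff):

import numpy as np

stats = np.load('mean_std.npz')  # path given by --output_path
print(stats['mean'].shape, stats['std'].shape)  # assumed key names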

@@ -0,0 +1,131 @@
"""Beam search parameters tuning for DeepSpeech2 model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import argparse
import functools
import paddle.v2 as paddle
import _init_paths
from data_utils.data import DataGenerator
from models.model import DeepSpeech2Model
from utils.error_rate import wer
from utils.utility import add_arguments, print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('num_samples', int, 100, "# of samples to infer.")
add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).")
add_arg('beam_size', int, 500, "Beam search width.")
add_arg('num_proc_bsearch', int, 12, "# of CPUs for beam search.")
add_arg('num_conv_layers', int, 2, "# of convolution layers.")
add_arg('num_rnn_layers', int, 3, "# of recurrent layers.")
add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.")
add_arg('num_alphas', int, 14, "# of alpha candidates for tuning.")
add_arg('num_betas', int, 20, "# of beta candidates for tuning.")
add_arg('alpha_from', float, 0.1, "Where alpha starts tuning from.")
add_arg('alpha_to', float, 0.36, "Where alpha ends tuning at.")
add_arg('beta_from', float, 0.05, "Where beta starts tuning from.")
add_arg('beta_to', float, 1.0, "Where beta ends tuning at.")
add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.")
add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.")
add_arg('use_gpu', bool, True, "Use GPU or not.")
add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across "
"bi-directional RNNs. Not for GRU.")
add_arg('tune_manifest', str,
'data/librispeech/manifest.dev',
"Filepath of manifest to tune.")
add_arg('mean_std_path', str,
'data/librispeech/mean_std.npz',
"Filepath of normalizer's mean & std.")
add_arg('vocab_path', str,
'data/librispeech/eng_vocab.txt',
"Filepath of vocabulary.")
add_arg('lang_model_path', str,
'lm/data/common_crawl_00.prune01111.trie.klm',
"Filepath for language model.")
add_arg('model_path', str,
'./checkpoints/params.latest.tar.gz',
"Filepath of the trained model used for tuning.")
add_arg('error_rate_type', str,
'wer',
"Error rate type for evaluation.",
choices=['wer', 'cer'])
add_arg('specgram_type', str,
'linear',
"Audio feature type. Options: linear, mfcc.",
choices=['linear', 'mfcc'])
# yapf: enable
args = parser.parse_args()
def tune():
"""Tune parameters alpha and beta on one minibatch."""
if not args.num_alphas >= 0:
raise ValueError("num_alphas must be non-negative!")
if not args.num_betas >= 0:
raise ValueError("num_betas must be non-negative!")
data_generator = DataGenerator(
vocab_filepath=args.vocab_path,
mean_std_filepath=args.mean_std_path,
augmentation_config='{}',
specgram_type=args.specgram_type,
num_threads=1)
batch_reader = data_generator.batch_reader_creator(
manifest_path=args.tune_manifest,
batch_size=args.num_samples,
sortagrad=False,
shuffle_method=None)
tune_data = batch_reader().next()
target_transcripts = [
''.join([data_generator.vocab_list[token] for token in transcript])
for _, transcript in tune_data
]
ds2_model = DeepSpeech2Model(
vocab_size=data_generator.vocab_size,
num_conv_layers=args.num_conv_layers,
num_rnn_layers=args.num_rnn_layers,
rnn_layer_size=args.rnn_layer_size,
use_gru=args.use_gru,
pretrained_model_path=args.model_path,
share_rnn_weights=args.share_rnn_weights)
# create grid for search
cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas)
cand_betas = np.linspace(args.beta_from, args.beta_to, args.num_betas)
params_grid = [(alpha, beta) for alpha in cand_alphas
for beta in cand_betas]
# tune parameters in loop
for alpha, beta in params_grid:
result_transcripts = ds2_model.infer_batch(
infer_data=tune_data,
decoding_method='ctc_beam_search',
beam_alpha=alpha,
beam_beta=beta,
beam_size=args.beam_size,
cutoff_prob=args.cutoff_prob,
vocab_list=data_generator.vocab_list,
language_model_path=args.lang_model_path,
num_processes=args.num_proc_bsearch)
wer_sum, num_ins = 0.0, 0
for target, result in zip(target_transcripts, result_transcripts):
wer_sum += wer(target, result)
num_ins += 1
print("alpha = %f\tbeta = %f\tWER = %f" %
(alpha, beta, wer_sum / num_ins))
def main():
print_arguments(args)
paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
tune()
if __name__ == '__main__':
main()
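For a sense of the search cost: with the defaults num_alphas=14 and num_betas=20, the grid built above holds 14 * 20 = 280 (alpha, beta) pairs, each of which triggers a full beam-search decode of the minibatch. A quick sketch of the grid construction:

import numpy as np

cand_alphas = np.linspace(0.1, 0.36, 14)
cand_betas = np.linspace(0.05, 1.0, 20)
params_grid = [(alpha, beta) for alpha in cand_alphas for beta in cand_betas]
print(len(params_grid))  # 280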

@@ -4,162 +4,91 @@ from __future__ import division
 from __future__ import print_function

 import argparse
-import distutils.util
-import multiprocessing
+import functools
 import paddle.v2 as paddle
-from model import DeepSpeech2Model
+from models.model import DeepSpeech2Model
 from data_utils.data import DataGenerator
-import utils
+from utils.utility import add_arguments, print_arguments

 parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
-    "--batch_size", default=256, type=int, help="Minibatch size.")
-parser.add_argument(
-    "--num_passes",
-    default=200,
-    type=int,
-    help="Training pass number. (default: %(default)s)")
-parser.add_argument(
-    "--num_iterations_print",
-    default=100,
-    type=int,
-    help="Number of iterations for every train cost printing. "
-    "(default: %(default)s)")
-parser.add_argument(
-    "--num_conv_layers",
-    default=2,
-    type=int,
-    help="Convolution layer number. (default: %(default)s)")
-parser.add_argument(
-    "--num_rnn_layers",
-    default=3,
-    type=int,
-    help="RNN layer number. (default: %(default)s)")
-parser.add_argument(
-    "--rnn_layer_size",
-    default=512,
-    type=int,
-    help="RNN layer cell number. (default: %(default)s)")
-parser.add_argument(
-    "--adam_learning_rate",
-    default=5e-4,
-    type=float,
-    help="Learning rate for ADAM Optimizer. (default: %(default)s)")
-parser.add_argument(
-    "--use_gpu",
-    default=True,
-    type=distutils.util.strtobool,
-    help="Use gpu or not. (default: %(default)s)")
-parser.add_argument(
-    "--use_sortagrad",
-    default=True,
-    type=distutils.util.strtobool,
-    help="Use sortagrad or not. (default: %(default)s)")
-parser.add_argument(
-    "--specgram_type",
-    default='linear',
-    type=str,
-    help="Feature type of audio data: 'linear' (power spectrum)"
-    " or 'mfcc'. (default: %(default)s)")
-parser.add_argument(
-    "--max_duration",
-    default=27.0,
-    type=float,
-    help="Audios with duration larger than this will be discarded. "
-    "(default: %(default)s)")
-parser.add_argument(
-    "--min_duration",
-    default=0.0,
-    type=float,
-    help="Audios with duration smaller than this will be discarded. "
-    "(default: %(default)s)")
-parser.add_argument(
-    "--shuffle_method",
-    default='batch_shuffle_clipped',
-    type=str,
-    help="Shuffle method: 'instance_shuffle', 'batch_shuffle', "
-    "'batch_shuffle_batch'. (default: %(default)s)")
-parser.add_argument(
-    "--trainer_count",
-    default=8,
-    type=int,
-    help="Trainer number. (default: %(default)s)")
-parser.add_argument(
-    "--num_threads_data",
-    default=multiprocessing.cpu_count() // 2,
-    type=int,
-    help="Number of cpu threads for preprocessing data. (default: %(default)s)")
-parser.add_argument(
-    "--mean_std_filepath",
-    default='mean_std.npz',
-    type=str,
-    help="Manifest path for normalizer. (default: %(default)s)")
-parser.add_argument(
-    "--train_manifest_path",
-    default='datasets/manifest.train',
-    type=str,
-    help="Manifest path for training. (default: %(default)s)")
-parser.add_argument(
-    "--dev_manifest_path",
-    default='datasets/manifest.dev',
-    type=str,
-    help="Manifest path for validation. (default: %(default)s)")
-parser.add_argument(
-    "--vocab_filepath",
-    default='datasets/vocab/eng_vocab.txt',
-    type=str,
-    help="Vocabulary filepath. (default: %(default)s)")
-parser.add_argument(
-    "--init_model_path",
-    default=None,
-    type=str,
-    help="If set None, the training will start from scratch. "
-    "Otherwise, the training will resume from "
-    "the existing model of this path. (default: %(default)s)")
-parser.add_argument(
-    "--output_model_dir",
-    default="./checkpoints",
-    type=str,
-    help="Directory for saving models. (default: %(default)s)")
-parser.add_argument(
-    "--augmentation_config",
-    default=open('conf/augmentation.config', 'r').read(),
-    type=str,
-    help="Augmentation configuration in json-format. "
-    "(default: %(default)s)")
-parser.add_argument(
-    "--is_local",
-    default=True,
-    type=distutils.util.strtobool,
-    help="Set to false if running with pserver in paddlecloud. "
-    "(default: %(default)s)")
+add_arg = functools.partial(add_arguments, argparser=parser)
+# yapf: disable
+add_arg('batch_size',       int,    256,    "Minibatch size.")
+add_arg('trainer_count',    int,    8,      "# of Trainers (CPUs or GPUs).")
+add_arg('num_passes',       int,    200,    "# of training epochs.")
+add_arg('num_proc_data',    int,    12,     "# of CPUs for data preprocessing.")
+add_arg('num_conv_layers',  int,    2,      "# of convolution layers.")
+add_arg('num_rnn_layers',   int,    3,      "# of recurrent layers.")
+add_arg('rnn_layer_size',   int,    2048,   "# of recurrent cells per layer.")
+add_arg('num_iter_print',   int,    100,    "Every # iterations for printing "
+                                            "train cost.")
+add_arg('learning_rate',    float,  5e-4,   "Learning rate.")
+add_arg('max_duration',     float,  27.0,   "Longest audio duration allowed.")
+add_arg('min_duration',     float,  0.0,    "Shortest audio duration allowed.")
+add_arg('use_sortagrad',    bool,   True,   "Use SortaGrad or not.")
+add_arg('use_gpu',          bool,   True,   "Use GPU or not.")
+add_arg('use_gru',          bool,   False,  "Use GRUs instead of simple RNNs.")
+add_arg('is_local',         bool,   True,   "Use pserver or not.")
+add_arg('share_rnn_weights',bool,   True,   "Share input-hidden weights across "
+                                            "bi-directional RNNs. Not for GRU.")
+add_arg('train_manifest',   str,
+        'data/librispeech/manifest.train',
+        "Filepath of train manifest.")
+add_arg('dev_manifest',     str,
+        'data/librispeech/manifest.dev-clean',
+        "Filepath of validation manifest.")
+add_arg('mean_std_path',    str,
+        'data/librispeech/mean_std.npz',
+        "Filepath of normalizer's mean & std.")
+add_arg('vocab_path',       str,
+        'data/librispeech/eng_vocab.txt',
+        "Filepath of vocabulary.")
+add_arg('init_model_path',  str,
+        None,
+        "If None, the training starts from scratch, "
+        "otherwise, it resumes from the pre-trained model.")
+add_arg('output_model_dir', str,
+        "./checkpoints",
+        "Directory for saving checkpoints.")
+add_arg('augment_conf_path',str,
+        'conf/augmentation.config',
+        "Filepath of augmentation configuration file (json-format).")
+add_arg('specgram_type',    str,
+        'linear',
+        "Audio feature type. Options: linear, mfcc.",
+        choices=['linear', 'mfcc'])
+add_arg('shuffle_method',   str,
+        'batch_shuffle_clipped',
+        "Shuffle method.",
+        choices=['instance_shuffle', 'batch_shuffle', 'batch_shuffle_clipped'])
+# yapf: enable
 args = parser.parse_args()


 def train():
     """DeepSpeech2 training."""
     train_generator = DataGenerator(
-        vocab_filepath=args.vocab_filepath,
-        mean_std_filepath=args.mean_std_filepath,
-        augmentation_config=args.augmentation_config,
+        vocab_filepath=args.vocab_path,
+        mean_std_filepath=args.mean_std_path,
+        augmentation_config=open(args.augment_conf_path, 'r').read(),
         max_duration=args.max_duration,
         min_duration=args.min_duration,
         specgram_type=args.specgram_type,
-        num_threads=args.num_threads_data)
+        num_threads=args.num_proc_data)
     dev_generator = DataGenerator(
-        vocab_filepath=args.vocab_filepath,
-        mean_std_filepath=args.mean_std_filepath,
+        vocab_filepath=args.vocab_path,
+        mean_std_filepath=args.mean_std_path,
         augmentation_config="{}",
         specgram_type=args.specgram_type,
-        num_threads=args.num_threads_data)
+        num_threads=args.num_proc_data)
     train_batch_reader = train_generator.batch_reader_creator(
-        manifest_path=args.train_manifest_path,
+        manifest_path=args.train_manifest,
         batch_size=args.batch_size,
         min_batch_size=args.trainer_count,
         sortagrad=args.use_sortagrad if args.init_model_path is None else False,
         shuffle_method=args.shuffle_method)
     dev_batch_reader = dev_generator.batch_reader_creator(
-        manifest_path=args.dev_manifest_path,
+        manifest_path=args.dev_manifest,
         batch_size=args.batch_size,
         min_batch_size=1,  # must be 1, but will have errors.
         sortagrad=False,
@@ -170,21 +99,23 @@ def train():
         num_conv_layers=args.num_conv_layers,
         num_rnn_layers=args.num_rnn_layers,
         rnn_layer_size=args.rnn_layer_size,
-        pretrained_model_path=args.init_model_path)
+        use_gru=args.use_gru,
+        pretrained_model_path=args.init_model_path,
+        share_rnn_weights=args.share_rnn_weights)
     ds2_model.train(
         train_batch_reader=train_batch_reader,
         dev_batch_reader=dev_batch_reader,
         feeding_dict=train_generator.feeding,
-        learning_rate=args.adam_learning_rate,
+        learning_rate=args.learning_rate,
         gradient_clipping=400,
         num_passes=args.num_passes,
-        num_iterations_print=args.num_iterations_print,
+        num_iterations_print=args.num_iter_print,
         output_model_dir=args.output_model_dir,
         is_local=args.is_local)


 def main():
-    utils.print_arguments(args)
+    print_arguments(args)
     paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
     train()
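One detail worth noting in the hunk above: SortaGrad is applied only when training starts from scratch; when resuming from init_model_path it is switched off, presumably because the sorted first epoch has already been consumed. Schematically (illustrative values, not repo code):

init_model_path = None  # fresh run
use_sortagrad = True
sortagrad = use_sortagrad if init_model_path is None else False
print(sortagrad)  # True only when starting from scratch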

@@ -1,196 +0,0 @@
"""Parameters tuning for DeepSpeech2 model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import distutils.util
import argparse
import multiprocessing
import paddle.v2 as paddle
from data_utils.data import DataGenerator
from model import DeepSpeech2Model
from error_rate import wer
import utils
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--num_samples",
default=100,
type=int,
help="Number of samples for parameters tuning. (default: %(default)s)")
parser.add_argument(
"--num_conv_layers",
default=2,
type=int,
help="Convolution layer number. (default: %(default)s)")
parser.add_argument(
"--num_rnn_layers",
default=3,
type=int,
help="RNN layer number. (default: %(default)s)")
parser.add_argument(
"--rnn_layer_size",
default=512,
type=int,
help="RNN layer cell number. (default: %(default)s)")
parser.add_argument(
"--use_gpu",
default=True,
type=distutils.util.strtobool,
help="Use gpu or not. (default: %(default)s)")
parser.add_argument(
"--trainer_count",
default=8,
type=int,
help="Trainer number. (default: %(default)s)")
parser.add_argument(
"--num_threads_data",
default=1,
type=int,
help="Number of cpu threads for preprocessing data. (default: %(default)s)")
parser.add_argument(
"--num_processes_beam_search",
default=multiprocessing.cpu_count() // 2,
type=int,
help="Number of cpu processes for beam search. (default: %(default)s)")
parser.add_argument(
"--specgram_type",
default='linear',
type=str,
help="Feature type of audio data: 'linear' (power spectrum)"
" or 'mfcc'. (default: %(default)s)")
parser.add_argument(
"--mean_std_filepath",
default='mean_std.npz',
type=str,
help="Manifest path for normalizer. (default: %(default)s)")
parser.add_argument(
"--tune_manifest_path",
default='datasets/manifest.dev',
type=str,
help="Manifest path for tuning. (default: %(default)s)")
parser.add_argument(
"--model_filepath",
default='checkpoints/params.latest.tar.gz',
type=str,
help="Model filepath. (default: %(default)s)")
parser.add_argument(
"--vocab_filepath",
default='datasets/vocab/eng_vocab.txt',
type=str,
help="Vocabulary filepath. (default: %(default)s)")
parser.add_argument(
"--beam_size",
default=500,
type=int,
help="Width for beam search decoding. (default: %(default)d)")
parser.add_argument(
"--language_model_path",
default="lm/data/common_crawl_00.prune01111.trie.klm",
type=str,
help="Path for language model. (default: %(default)s)")
parser.add_argument(
"--alpha_from",
default=0.1,
type=float,
help="Where alpha starts from. (default: %(default)f)")
parser.add_argument(
"--num_alphas",
default=14,
type=int,
help="Number of candidate alphas. (default: %(default)d)")
parser.add_argument(
"--alpha_to",
default=0.36,
type=float,
help="Where alpha ends with. (default: %(default)f)")
parser.add_argument(
"--beta_from",
default=0.05,
type=float,
help="Where beta starts from. (default: %(default)f)")
parser.add_argument(
"--num_betas",
default=20,
type=float,
help="Number of candidate betas. (default: %(default)d)")
parser.add_argument(
"--beta_to",
default=1.0,
type=float,
help="Where beta ends with. (default: %(default)f)")
parser.add_argument(
"--cutoff_prob",
default=0.99,
type=float,
help="The cutoff probability of pruning"
"in beam search. (default: %(default)f)")
args = parser.parse_args()
def tune():
"""Tune parameters alpha and beta on one minibatch."""
if not args.num_alphas >= 0:
raise ValueError("num_alphas must be non-negative!")
if not args.num_betas >= 0:
raise ValueError("num_betas must be non-negative!")
data_generator = DataGenerator(
vocab_filepath=args.vocab_filepath,
mean_std_filepath=args.mean_std_filepath,
augmentation_config='{}',
specgram_type=args.specgram_type,
num_threads=args.num_threads_data)
batch_reader = data_generator.batch_reader_creator(
manifest_path=args.tune_manifest_path,
batch_size=args.num_samples,
sortagrad=False,
shuffle_method=None)
tune_data = batch_reader().next()
target_transcripts = [
''.join([data_generator.vocab_list[token] for token in transcript])
for _, transcript in tune_data
]
ds2_model = DeepSpeech2Model(
vocab_size=data_generator.vocab_size,
num_conv_layers=args.num_conv_layers,
num_rnn_layers=args.num_rnn_layers,
rnn_layer_size=args.rnn_layer_size,
pretrained_model_path=args.model_filepath)
# create grid for search
cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas)
cand_betas = np.linspace(args.beta_from, args.beta_to, args.num_betas)
params_grid = [(alpha, beta) for alpha in cand_alphas
for beta in cand_betas]
## tune parameters in loop
for alpha, beta in params_grid:
result_transcripts = ds2_model.infer_batch(
infer_data=tune_data,
decode_method='beam_search',
beam_alpha=alpha,
beam_beta=beta,
beam_size=args.beam_size,
cutoff_prob=args.cutoff_prob,
vocab_list=data_generator.vocab_list,
language_model_path=args.language_model_path,
num_processes=args.num_processes_beam_search)
wer_sum, num_ins = 0.0, 0
for target, result in zip(target_transcripts, result_transcripts):
wer_sum += wer(target, result)
num_ins += 1
print("alpha = %f\tbeta = %f\tWER = %f" %
(alpha, beta, wer_sum / num_ins))
def main():
utils.print_arguments(args)
paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
tune()
if __name__ == '__main__':
main()

@@ -1,25 +0,0 @@
"""Contains common utility functions."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
def print_arguments(args):
"""Print argparse's arguments.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
parser.add_argument("name", default="Jonh", type=str, help="User name.")
args = parser.parse_args()
print_arguments(args)
:param args: Input argparse.Namespace for printing.
:type args: argparse.Namespace
"""
print("----- Configuration Arguments -----")
for arg, value in vars(args).iteritems():
print("%s: %s" % (arg, value))
print("------------------------------------")

@@ -5,7 +5,7 @@ from __future__ import division
 from __future__ import print_function

 import unittest
-import error_rate
+from utils import error_rate


 class TestParse(unittest.TestCase):

@@ -0,0 +1,47 @@
"""Contains common utility functions."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import distutils.util
def print_arguments(args):
"""Print argparse's arguments.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
parser.add_argument("name", default="Jonh", type=str, help="User name.")
args = parser.parse_args()
print_arguments(args)
:param args: Input argparse.Namespace for printing.
:type args: argparse.Namespace
"""
print("----------- Configuration Arguments -----------")
for arg, value in sorted(vars(args).iteritems()):
print("%s: %s" % (arg, value))
print("------------------------------------------------")
def add_arguments(argname, type, default, help, argparser, **kwargs):
"""Add argparse's argument.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
add_argument("name", str, "Jonh", "User name.", parser)
args = parser.parse_args()
"""
type = distutils.util.strtobool if type == bool else type
argparser.add_argument(
"--" + argname,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
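A short usage sketch for add_arguments: routing bool through distutils.util.strtobool is what lets a flag like --use_gpu False parse as false, whereas plain argparse type=bool would treat any non-empty string as true. Note strtobool returns 0/1 rather than a bool:

import argparse

parser = argparse.ArgumentParser()
add_arguments("use_gpu", bool, True, "Use GPU or not.", parser)
args = parser.parse_args(["--use_gpu", "False"])
print(args.use_gpu)  # 0, via strtobool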