diff --git a/README.md b/README.md old mode 100755 new mode 100644 index 4e8befa5..db07d8c2 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # DeepSpeech2 on PaddlePaddle +>TODO: to be updated, since the directory hierarchy was changed. + ## Installation ``` diff --git a/cloud/README.md b/cloud/README.md old mode 100755 new mode 100644 diff --git a/cloud/pcloud_submit.sh b/cloud/pcloud_submit.sh index a7fb42cb..378a7c6e 100644 --- a/cloud/pcloud_submit.sh +++ b/cloud/pcloud_submit.sh @@ -1,7 +1,9 @@ -TRAIN_MANIFEST="cloud/cloud.manifest.train" -DEV_MANIFEST="cloud/cloud.manifest.dev" -CLOUD_MODEL_DIR="/pfs/dlnel/home/USERNAME/deepspeech2/model" -BATCH_SIZE=256 +#! /usr/bin/bash + +TRAIN_MANIFEST="cloud/cloud_manifests/cloud.manifest.train" +DEV_MANIFEST="cloud/cloud_manifests/cloud.manifest.dev" +CLOUD_MODEL_DIR="./checkpoints" +BATCH_SIZE=512 NUM_GPU=8 NUM_NODE=1 IS_LOCAL="True" @@ -11,7 +13,7 @@ DS2_PATH=${PWD%/*} cp -f pcloud_train.sh ${DS2_PATH} paddlecloud submit \ --image bootstrapper:5000/wanghaoshuang/pcloud_ds2:latest \ +-image bootstrapper:5000/paddlepaddle/pcloud_ds2:latest \ -jobname ${JOB_NAME} \ -cpu ${NUM_GPU} \ -gpu ${NUM_GPU} \ diff --git a/cloud/pcloud_train.sh b/cloud/pcloud_train.sh index e42da1d6..d04132f9 100644 --- a/cloud/pcloud_train.sh +++ b/cloud/pcloud_train.sh @@ -1,3 +1,5 @@ +#! /usr/bin/bash + TRAIN_MANIFEST=$1 DEV_MANIFEST=$2 MODEL_PATH=$3 @@ -13,12 +15,30 @@ python ./cloud/split_data.py \ --in_manifest_path=${DEV_MANIFEST} \ --out_manifest_path='/local.manifest.dev' -python train.py \ ---batch_size=$BATCH_SIZE \ ---use_gpu=1 \ +python -u train.py \ +--batch_size=${BATCH_SIZE} \ --trainer_count=${NUM_GPU} \ ---num_threads_data=${NUM_GPU} \ +--num_passes=200 \ +--num_proc_data=${NUM_GPU} \ +--num_conv_layers=2 \ +--num_rnn_layers=3 \ +--rnn_layer_size=2048 \ +--num_iter_print=100 \ +--learning_rate=5e-4 \ +--max_duration=27.0 \ +--min_duration=0.0 \ +--use_sortagrad=True \ +--use_gru=False \ +--use_gpu=True \ --is_local=${IS_LOCAL} \ ---train_manifest_path='/local.manifest.train' \ ---dev_manifest_path='/local.manifest.dev' \ +--share_rnn_weights=True \ +--train_manifest='/local.manifest.train' \ +--dev_manifest='/local.manifest.dev' \ +--mean_std_path='data/librispeech/mean_std.npz' \ +--vocab_path='data/librispeech/eng_vocab.txt' \ +--output_model_dir='./checkpoints' \ --output_model_dir=${MODEL_PATH} \ +--augment_conf_path='conf/augmentation.config' \ +--specgram_type='linear' \ +--shuffle_method='batch_shuffle_clipped' \ +2>&1 | tee ./log/train.log diff --git a/cloud/pcloud_upload_data.sh b/cloud/pcloud_upload_data.sh index 97a0ab18..4ef235ef 100644 --- a/cloud/pcloud_upload_data.sh +++ b/cloud/pcloud_upload_data.sh @@ -1,5 +1,9 @@ -IN_MANIFESTS="../datasets/manifest.train ../datasets/manifest.dev ../datasets/manifest.test" -OUT_MANIFESTS="./cloud.manifest.train ./cloud.manifest.dev ./cloud.manifest.test" +#! /usr/bin/bash + +mkdir cloud_manifests + +IN_MANIFESTS="../data/librispeech/manifest.train ../data/librispeech/manifest.dev-clean ../data/librispeech/manifest.test-clean" +OUT_MANIFESTS="cloud_manifests/cloud.manifest.train cloud_manifests/cloud.manifest.dev cloud_manifests/cloud.manifest.test" CLOUD_DATA_DIR="/pfs/dlnel/home/USERNAME/deepspeech2/data/librispeech" NUM_SHARDS=50 @@ -14,4 +18,5 @@ then echo "Upload Data Failed!" exit 1 fi + echo "All Done." diff --git a/datasets/vocab/eng_vocab.txt b/data/librispeech/eng_vocab.txt similarity index 100% rename from datasets/vocab/eng_vocab.txt rename to data/librispeech/eng_vocab.txt diff --git a/datasets/librispeech/librispeech.py b/data/librispeech/librispeech.py similarity index 100% rename from datasets/librispeech/librispeech.py rename to data/librispeech/librispeech.py diff --git a/datasets/noise/chime3_background.py b/data/noise/chime3_background.py similarity index 100% rename from datasets/noise/chime3_background.py rename to data/noise/chime3_background.py diff --git a/data_utils/augmentor/impulse_response.py b/data_utils/augmentor/impulse_response.py index c3de0fdb..536b4d6a 100644 --- a/data_utils/augmentor/impulse_response.py +++ b/data_utils/augmentor/impulse_response.py @@ -4,23 +4,22 @@ from __future__ import division from __future__ import print_function from data_utils.augmentor.base import AugmentorBase -from data_utils import utils +from data_utils.utility import read_manifest from data_utils.audio import AudioSegment class ImpulseResponseAugmentor(AugmentorBase): """Augmentation model for adding impulse response effect. - + :param rng: Random generator object. :type rng: random.Random :param impulse_manifest_path: Manifest path for impulse audio data. - :type impulse_manifest_path: basestring + :type impulse_manifest_path: basestring """ def __init__(self, rng, impulse_manifest_path): self._rng = rng - self._impulse_manifest = utils.read_manifest( - manifest_path=impulse_manifest_path) + self._impulse_manifest = read_manifest(impulse_manifest_path) def transform_audio(self, audio_segment): """Add impulse response effect. diff --git a/data_utils/augmentor/noise_perturb.py b/data_utils/augmentor/noise_perturb.py index 281174af..96e0ff4d 100644 --- a/data_utils/augmentor/noise_perturb.py +++ b/data_utils/augmentor/noise_perturb.py @@ -4,13 +4,13 @@ from __future__ import division from __future__ import print_function from data_utils.augmentor.base import AugmentorBase -from data_utils import utils +from data_utils.utility import read_manifest from data_utils.audio import AudioSegment class NoisePerturbAugmentor(AugmentorBase): """Augmentation model for adding background noise. - + :param rng: Random generator object. :type rng: random.Random :param min_snr_dB: Minimal signal noise ratio, in decibels. @@ -18,15 +18,14 @@ class NoisePerturbAugmentor(AugmentorBase): :param max_snr_dB: Maximal signal noise ratio, in decibels. :type max_snr_dB: float :param noise_manifest_path: Manifest path for noise audio data. - :type noise_manifest_path: basestring + :type noise_manifest_path: basestring """ def __init__(self, rng, min_snr_dB, max_snr_dB, noise_manifest_path): self._min_snr_dB = min_snr_dB self._max_snr_dB = max_snr_dB self._rng = rng - self._noise_manifest = utils.read_manifest( - manifest_path=noise_manifest_path) + self._noise_manifest = read_manifest(manifest_path=noise_manifest_path) def transform_audio(self, audio_segment): """Add background noise audio. diff --git a/data_utils/data.py b/data_utils/data.py index 98180b4b..8bff6826 100644 --- a/data_utils/data.py +++ b/data_utils/data.py @@ -11,7 +11,7 @@ import multiprocessing import numpy as np import paddle.v2 as paddle from threading import local -from data_utils import utils +from data_utils.utility import read_manifest from data_utils.augmentor.augmentation import AugmentationPipeline from data_utils.featurizer.speech_featurizer import SpeechFeaturizer from data_utils.speech import SpeechSegment @@ -85,9 +85,9 @@ class DataGenerator(object): self._rng = random.Random(random_seed) self._epoch = 0 # for caching tar files info - self.local_data = local() - self.local_data.tar2info = {} - self.local_data.tar2object = {} + self._local_data = local() + self._local_data.tar2info = {} + self._local_data.tar2object = {} def process_utterance(self, filename, transcript): """Load, augment, featurize and normalize for speech data. @@ -159,7 +159,7 @@ class DataGenerator(object): def batch_reader(): # read manifest - manifest = utils.read_manifest( + manifest = read_manifest( manifest_path=manifest_path, max_duration=self._max_duration, min_duration=self._min_duration) @@ -240,16 +240,16 @@ class DataGenerator(object): """ if file.startswith('tar:'): tarpath, filename = file.split(':', 1)[1].split('#', 1) - if 'tar2info' not in self.local_data.__dict__: - self.local_data.tar2info = {} - if 'tar2object' not in self.local_data.__dict__: - self.local_data.tar2object = {} - if tarpath not in self.local_data.tar2info: + if 'tar2info' not in self._local_data.__dict__: + self._local_data.tar2info = {} + if 'tar2object' not in self._local_data.__dict__: + self._local_data.tar2object = {} + if tarpath not in self._local_data.tar2info: object, infoes = self._parse_tar(tarpath) - self.local_data.tar2info[tarpath] = infoes - self.local_data.tar2object[tarpath] = object - return self.local_data.tar2object[tarpath].extractfile( - self.local_data.tar2info[tarpath][filename]) + self._local_data.tar2info[tarpath] = infoes + self._local_data.tar2object[tarpath] = object + return self._local_data.tar2object[tarpath].extractfile( + self._local_data.tar2info[tarpath][filename]) else: return open(file, 'r') diff --git a/data_utils/featurizer/audio_featurizer.py b/data_utils/featurizer/audio_featurizer.py index 39f45301..12f8784a 100644 --- a/data_utils/featurizer/audio_featurizer.py +++ b/data_utils/featurizer/audio_featurizer.py @@ -4,7 +4,7 @@ from __future__ import division from __future__ import print_function import numpy as np -from data_utils import utils +from data_utils.utility import read_manifest from data_utils.audio import AudioSegment from python_speech_features import mfcc from python_speech_features import delta diff --git a/data_utils/normalizer.py b/data_utils/normalizer.py index 1f4aae9a..7c2e05c9 100644 --- a/data_utils/normalizer.py +++ b/data_utils/normalizer.py @@ -5,7 +5,7 @@ from __future__ import print_function import numpy as np import random -import data_utils.utils as utils +from data_utils.utility import read_manifest from data_utils.audio import AudioSegment @@ -75,7 +75,7 @@ class FeatureNormalizer(object): def _compute_mean_std(self, manifest_path, featurize_func, num_samples): """Compute mean and std from randomly sampled instances.""" - manifest = utils.read_manifest(manifest_path) + manifest = read_manifest(manifest_path) sampled_manifest = self._rng.sample(manifest, num_samples) features = [] for instance in sampled_manifest: diff --git a/data_utils/utils.py b/data_utils/utility.py similarity index 100% rename from data_utils/utils.py rename to data_utils/utility.py diff --git a/datasets/run_all.sh b/datasets/run_all.sh deleted file mode 100644 index ef2b721f..00000000 --- a/datasets/run_all.sh +++ /dev/null @@ -1,13 +0,0 @@ -cd librispeech -python librispeech.py -if [ $? -ne 0 ]; then - echo "Prepare LibriSpeech failed. Terminated." - exit 1 -fi -cd - - -cat librispeech/manifest.train* | shuf > manifest.train -cat librispeech/manifest.dev-clean > manifest.dev -cat librispeech/manifest.test-clean > manifest.test - -echo "All done." diff --git a/datasets/run_noise.sh b/datasets/run_noise.sh deleted file mode 100644 index 7b27abde..00000000 --- a/datasets/run_noise.sh +++ /dev/null @@ -1,10 +0,0 @@ -cd noise -python chime3_background.py -if [ $? -ne 0 ]; then - echo "Prepare CHiME3 background noise failed. Terminated." - exit 1 -fi -cd - - -cat noise/manifest.* > manifest.noise -echo "All done." diff --git a/deploy/_init_paths.py b/deploy/_init_paths.py new file mode 100644 index 00000000..ddabb535 --- /dev/null +++ b/deploy/_init_paths.py @@ -0,0 +1,19 @@ +"""Set up paths for DS2""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os.path +import sys + + +def add_path(path): + if path not in sys.path: + sys.path.insert(0, path) + + +this_dir = os.path.dirname(__file__) + +# Add project path to PYTHONPATH +proj_path = os.path.join(this_dir, '..') +add_path(proj_path) diff --git a/demo_client.py b/deploy/demo_client.py similarity index 100% rename from demo_client.py rename to deploy/demo_client.py diff --git a/demo_server.py b/deploy/demo_server.py similarity index 60% rename from demo_server.py rename to deploy/demo_server.py index c7e7e94a..658b1419 100644 --- a/demo_server.py +++ b/deploy/demo_server.py @@ -3,111 +3,64 @@ import os import time import random import argparse -import distutils.util +import functools from time import gmtime, strftime import SocketServer import struct import wave import paddle.v2 as paddle -from utils import print_arguments +import _init_paths from data_utils.data import DataGenerator -from model import DeepSpeech2Model +from models.model import DeepSpeech2Model from data_utils.utils import read_manifest +from utils.utility import add_arguments, print_arguments parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument( - "--host_ip", - default="localhost", - type=str, - help="Server IP address. (default: %(default)s)") -parser.add_argument( - "--host_port", - default=8086, - type=int, - help="Server Port. (default: %(default)s)") -parser.add_argument( - "--speech_save_dir", - default="demo_cache", - type=str, - help="Directory for saving demo speech. (default: %(default)s)") -parser.add_argument( - "--vocab_filepath", - default='datasets/vocab/eng_vocab.txt', - type=str, - help="Vocabulary filepath. (default: %(default)s)") -parser.add_argument( - "--mean_std_filepath", - default='mean_std.npz', - type=str, - help="Manifest path for normalizer. (default: %(default)s)") -parser.add_argument( - "--warmup_manifest_path", - default='datasets/manifest.test', - type=str, - help="Manifest path for warmup test. (default: %(default)s)") -parser.add_argument( - "--specgram_type", - default='linear', - type=str, - help="Feature type of audio data: 'linear' (power spectrum)" - " or 'mfcc'. (default: %(default)s)") -parser.add_argument( - "--num_conv_layers", - default=2, - type=int, - help="Convolution layer number. (default: %(default)s)") -parser.add_argument( - "--num_rnn_layers", - default=3, - type=int, - help="RNN layer number. (default: %(default)s)") -parser.add_argument( - "--rnn_layer_size", - default=512, - type=int, - help="RNN layer cell number. (default: %(default)s)") -parser.add_argument( - "--use_gpu", - default=True, - type=distutils.util.strtobool, - help="Use gpu or not. (default: %(default)s)") -parser.add_argument( - "--model_filepath", - default='checkpoints/params.latest.tar.gz', - type=str, - help="Model filepath. (default: %(default)s)") -parser.add_argument( - "--decode_method", - default='beam_search', - type=str, - help="Method for ctc decoding: best_path or beam_search. " - "(default: %(default)s)") -parser.add_argument( - "--beam_size", - default=100, - type=int, - help="Width for beam search decoding. (default: %(default)d)") -parser.add_argument( - "--language_model_path", - default="lm/data/common_crawl_00.prune01111.trie.klm", - type=str, - help="Path for language model. (default: %(default)s)") -parser.add_argument( - "--alpha", - default=0.36, - type=float, - help="Parameter associated with language model. (default: %(default)f)") -parser.add_argument( - "--beta", - default=0.25, - type=float, - help="Parameter associated with word count. (default: %(default)f)") -parser.add_argument( - "--cutoff_prob", - default=0.99, - type=float, - help="The cutoff probability of pruning" - "in beam search. (default: %(default)f)") +add_arg = functools.partial(add_arguments, argparser=parser) +# yapf: disable +add_arg('host_port', int, 8086, "Server's IP port.") +add_arg('beam_size', int, 500, "Beam search width.") +add_arg('num_conv_layers', int, 2, "# of convolution layers.") +add_arg('num_rnn_layers', int, 3, "# of recurrent layers.") +add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.") +add_arg('alpha', float, 0.36, "Coef of LM for beam search.") +add_arg('beta', float, 0.25, "Coef of WC for beam search.") +add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.") +add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.") +add_arg('use_gpu', bool, True, "Use GPU or not.") +add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " + "bi-directional RNNs. Not for GRU.") +add_arg('host_ip', str, + 'localhost', + "Server's IP address.") +add_arg('speech_save_dir', str, + 'demo_cache', + "Directory to save demo audios.") +add_arg('warmup_manifest', str, + 'data/librispeech/manifest.test-clean', + "Filepath of manifest to warm up.") +add_arg('mean_std_path', str, + 'data/librispeech/mean_std.npz', + "Filepath of normalizer's mean & std.") +add_arg('vocab_path', str, + 'data/librispeech/eng_vocab.txt', + "Filepath of vocabulary.") +add_arg('model_path', str, + './checkpoints/params.latest.tar.gz', + "If None, the training starts from scratch, " + "otherwise, it resumes from the pre-trained model.") +add_arg('lang_model_path', str, + 'lm/data/common_crawl_00.prune01111.trie.klm', + "Filepath for language model.") +add_arg('decoding_method', str, + 'ctc_beam_search', + "Decoding method. Options: ctc_beam_search, ctc_greedy", + choices = ['ctc_beam_search', 'ctc_greedy']) +add_arg('specgram_type', str, + 'linear', + "Audio feature type. Options: linear, mfcc.", + choices=['linear', 'mfcc']) +# yapf: disable args = parser.parse_args() @@ -188,8 +141,8 @@ def start_server(): """Start the ASR server""" # prepare data generator data_generator = DataGenerator( - vocab_filepath=args.vocab_filepath, - mean_std_filepath=args.mean_std_filepath, + vocab_filepath=args.vocab_path, + mean_std_filepath=args.mean_std_path, augmentation_config='{}', specgram_type=args.specgram_type, num_threads=1) @@ -199,20 +152,22 @@ def start_server(): num_conv_layers=args.num_conv_layers, num_rnn_layers=args.num_rnn_layers, rnn_layer_size=args.rnn_layer_size, - pretrained_model_path=args.model_filepath) + use_gru=args.use_gru, + pretrained_model_path=args.model_path, + share_rnn_weights=args.share_rnn_weights) # prepare ASR inference handler def file_to_transcript(filename): feature = data_generator.process_utterance(filename, "") result_transcript = ds2_model.infer_batch( infer_data=[feature], - decode_method=args.decode_method, + decoding_method=args.decoding_method, beam_alpha=args.alpha, beam_beta=args.beta, beam_size=args.beam_size, cutoff_prob=args.cutoff_prob, vocab_list=data_generator.vocab_list, - language_model_path=args.language_model_path, + language_model_path=args.lang_model_path, num_processes=1) return result_transcript[0] @@ -221,7 +176,7 @@ def start_server(): print('Warming up ...') warm_up_test( audio_process_handler=file_to_transcript, - manifest_path=args.warmup_manifest_path, + manifest_path=args.warmup_manifest, num_test_cases=3) print('-----------------------------------------------------------') diff --git a/evaluate.py b/evaluate.py deleted file mode 100644 index 82dcec3c..00000000 --- a/evaluate.py +++ /dev/null @@ -1,180 +0,0 @@ -"""Evaluation for DeepSpeech2 model.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import distutils.util -import argparse -import multiprocessing -import paddle.v2 as paddle -from data_utils.data import DataGenerator -from model import DeepSpeech2Model -from error_rate import wer, cer -import utils - -parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument( - "--batch_size", - default=128, - type=int, - help="Minibatch size for evaluation. (default: %(default)s)") -parser.add_argument( - "--trainer_count", - default=8, - type=int, - help="Trainer number. (default: %(default)s)") -parser.add_argument( - "--num_conv_layers", - default=2, - type=int, - help="Convolution layer number. (default: %(default)s)") -parser.add_argument( - "--num_rnn_layers", - default=3, - type=int, - help="RNN layer number. (default: %(default)s)") -parser.add_argument( - "--rnn_layer_size", - default=512, - type=int, - help="RNN layer cell number. (default: %(default)s)") -parser.add_argument( - "--use_gpu", - default=True, - type=distutils.util.strtobool, - help="Use gpu or not. (default: %(default)s)") -parser.add_argument( - "--num_threads_data", - default=multiprocessing.cpu_count() // 2, - type=int, - help="Number of cpu threads for preprocessing data. (default: %(default)s)") -parser.add_argument( - "--num_processes_beam_search", - default=multiprocessing.cpu_count() // 2, - type=int, - help="Number of cpu processes for beam search. (default: %(default)s)") -parser.add_argument( - "--mean_std_filepath", - default='mean_std.npz', - type=str, - help="Manifest path for normalizer. (default: %(default)s)") -parser.add_argument( - "--decode_method", - default='beam_search', - type=str, - help="Method for ctc decoding, best_path or beam_search. " - "(default: %(default)s)") -parser.add_argument( - "--language_model_path", - default="lm/data/common_crawl_00.prune01111.trie.klm", - type=str, - help="Path for language model. (default: %(default)s)") -parser.add_argument( - "--alpha", - default=0.36, - type=float, - help="Parameter associated with language model. (default: %(default)f)") -parser.add_argument( - "--beta", - default=0.25, - type=float, - help="Parameter associated with word count. (default: %(default)f)") -parser.add_argument( - "--cutoff_prob", - default=0.99, - type=float, - help="The cutoff probability of pruning" - "in beam search. (default: %(default)f)") -parser.add_argument( - "--beam_size", - default=500, - type=int, - help="Width for beam search decoding. (default: %(default)d)") -parser.add_argument( - "--specgram_type", - default='linear', - type=str, - help="Feature type of audio data: 'linear' (power spectrum)" - " or 'mfcc'. (default: %(default)s)") -parser.add_argument( - "--decode_manifest_path", - default='datasets/manifest.test', - type=str, - help="Manifest path for decoding. (default: %(default)s)") -parser.add_argument( - "--model_filepath", - default='checkpoints/params.latest.tar.gz', - type=str, - help="Model filepath. (default: %(default)s)") -parser.add_argument( - "--vocab_filepath", - default='datasets/vocab/eng_vocab.txt', - type=str, - help="Vocabulary filepath. (default: %(default)s)") -parser.add_argument( - "--error_rate_type", - default='wer', - choices=['wer', 'cer'], - type=str, - help="Error rate type for evaluation. 'wer' for word error rate and 'cer' " - "for character error rate. " - "(default: %(default)s)") -args = parser.parse_args() - - -def evaluate(): - """Evaluate on whole test data for DeepSpeech2.""" - data_generator = DataGenerator( - vocab_filepath=args.vocab_filepath, - mean_std_filepath=args.mean_std_filepath, - augmentation_config='{}', - specgram_type=args.specgram_type, - num_threads=args.num_threads_data) - batch_reader = data_generator.batch_reader_creator( - manifest_path=args.decode_manifest_path, - batch_size=args.batch_size, - min_batch_size=1, - sortagrad=False, - shuffle_method=None) - - ds2_model = DeepSpeech2Model( - vocab_size=data_generator.vocab_size, - num_conv_layers=args.num_conv_layers, - num_rnn_layers=args.num_rnn_layers, - rnn_layer_size=args.rnn_layer_size, - pretrained_model_path=args.model_filepath) - - error_rate_func = cer if args.error_rate_type == 'cer' else wer - error_sum, num_ins = 0.0, 0 - for infer_data in batch_reader(): - result_transcripts = ds2_model.infer_batch( - infer_data=infer_data, - decode_method=args.decode_method, - beam_alpha=args.alpha, - beam_beta=args.beta, - beam_size=args.beam_size, - cutoff_prob=args.cutoff_prob, - vocab_list=data_generator.vocab_list, - language_model_path=args.language_model_path, - num_processes=args.num_processes_beam_search) - target_transcripts = [ - ''.join([data_generator.vocab_list[token] for token in transcript]) - for _, transcript in infer_data - ] - for target, result in zip(target_transcripts, result_transcripts): - error_sum += error_rate_func(target, result) - num_ins += 1 - print("Error rate [%s] (%d/?) = %f" % - (args.error_rate_type, num_ins, error_sum / num_ins)) - print("Final error rate [%s] (%d/%d) = %f" % - (args.error_rate_type, num_ins, num_ins, error_sum / num_ins)) - - -def main(): - utils.print_arguments(args) - paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) - evaluate() - - -if __name__ == '__main__': - main() diff --git a/examples/librispeech/generate.sh b/examples/librispeech/generate.sh new file mode 100644 index 00000000..a34b7bc1 --- /dev/null +++ b/examples/librispeech/generate.sh @@ -0,0 +1,28 @@ +#! /usr/bin/bash + +pushd ../.. + +CUDA_VISIBLE_DEVICES=0 \ +python -u infer.py \ +--num_samples=10 \ +--trainer_count=1 \ +--beam_size=500 \ +--num_proc_bsearch=12 \ +--num_proc_data=12 \ +--num_conv_layers=2 \ +--num_rnn_layers=3 \ +--rnn_layer_size=2048 \ +--alpha=0.36 \ +--beta=0.25 \ +--cutoff_prob=0.99 \ +--use_gru=False \ +--use_gpu=True \ +--share_rnn_weights=True \ +--infer_manifest='data/librispeech/manifest.dev-clean' \ +--mean_std_path='data/librispeech/mean_std.npz' \ +--vocab_path='data/librispeech/eng_vocab.txt' \ +--model_path='checkpoints/params.latest.tar.gz' \ +--lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \ +--decoding_method='ctc_beam_search' \ +--error_rate_type='wer' \ +--specgram_type='linear' diff --git a/examples/librispeech/prepare_data.sh b/examples/librispeech/prepare_data.sh new file mode 100644 index 00000000..162a38c4 --- /dev/null +++ b/examples/librispeech/prepare_data.sh @@ -0,0 +1,32 @@ +#! /usr/bin/bash + +pushd ../.. + +# download data, generate manifests +python data/librispeech/librispeech.py \ +--manifest_prefix='data/librispeech/manifest' \ +--full_download='True' \ +--target_dir='~/.cache/paddle/dataset/speech/Libri' + +if [ $? -ne 0 ]; then + echo "Prepare LibriSpeech failed. Terminated." + exit 1 +fi + +#cat data/librispeech/manifest.train* | shuf > data/librispeech/manifest.train + + +# compute mean and stddev for normalizer +python tools/compute_mean_std.py \ +--manifest_path='data/librispeech/manifest.train' \ +--num_samples=2000 \ +--specgram_type='linear' \ +--output_path='data/librispeech/mean_std.npz' + +if [ $? -ne 0 ]; then + echo "Compute mean and stddev failed. Terminated." + exit 1 +fi + + +echo "LibriSpeech Data preparation done." diff --git a/examples/librispeech/run_test.sh b/examples/librispeech/run_test.sh new file mode 100644 index 00000000..5a14cb68 --- /dev/null +++ b/examples/librispeech/run_test.sh @@ -0,0 +1,28 @@ +#! /usr/bin/bash + +pushd ../.. + +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ +python -u evaluate.py \ +--batch_size=128 \ +--trainer_count=8 \ +--beam_size=500 \ +--num_proc_bsearch=12 \ +--num_proc_data=12 \ +--num_conv_layers=2 \ +--num_rnn_layers=3 \ +--rnn_layer_size=2048 \ +--alpha=0.36 \ +--beta=0.25 \ +--cutoff_prob=0.99 \ +--use_gru=False \ +--use_gpu=True \ +--share_rnn_weights=True \ +--test_manifest='data/librispeech/manifest.test-clean' \ +--mean_std_path='data/librispeech/mean_std.npz' \ +--vocab_path='data/librispeech/eng_vocab.txt' \ +--model_path='checkpoints/params.latest.tar.gz' \ +--lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \ +--decoding_method='ctc_beam_search' \ +--error_rate_type='wer' \ +--specgram_type='linear' diff --git a/examples/librispeech/run_train.sh b/examples/librispeech/run_train.sh new file mode 100644 index 00000000..832838a8 --- /dev/null +++ b/examples/librispeech/run_train.sh @@ -0,0 +1,30 @@ +#! /usr/bin/bash + +pushd ../.. + +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ +python -u train.py \ +--batch_size=256 \ +--trainer_count=8 \ +--num_passes=200 \ +--num_proc_data=12 \ +--num_conv_layers=2 \ +--num_rnn_layers=3 \ +--rnn_layer_size=2048 \ +--num_iter_print=100 \ +--learning_rate=5e-4 \ +--max_duration=27.0 \ +--min_duration=0.0 \ +--use_sortagrad=True \ +--use_gru=False \ +--use_gpu=True \ +--is_local=True \ +--share_rnn_weights=True \ +--train_manifest='data/librispeech/manifest.train' \ +--dev_manifest='data/librispeech/manifest.dev' \ +--mean_std_path='data/librispeech/mean_std.npz' \ +--vocab_path='data/librispeech/eng_vocab.txt' \ +--output_model_dir='./checkpoints' \ +--augment_conf_path='conf/augmentation.config' \ +--specgram_type='linear' \ +--shuffle_method='batch_shuffle_clipped' diff --git a/examples/librispeech/run_tune.sh b/examples/librispeech/run_tune.sh new file mode 100644 index 00000000..9d992e88 --- /dev/null +++ b/examples/librispeech/run_tune.sh @@ -0,0 +1,30 @@ +#! /usr/bin/bash + +pushd ../.. + +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ +python -u tools/tune.py \ +--num_samples=100 \ +--trainer_count=8 \ +--beam_size=500 \ +--num_proc_bsearch=12 \ +--num_conv_layers=2 \ +--num_rnn_layers=3 \ +--rnn_layer_size=2048 \ +--num_alphas=14 \ +--num_betas=20 \ +--alpha_from=0.1 \ +--alpha_to=0.36 \ +--beta_from=0.05 \ +--beta_to=1.0 \ +--cutoff_prob=0.99 \ +--use_gru=False \ +--use_gpu=True \ +--share_rnn_weights=True \ +--tune_manifest='data/librispeech/manifest.dev-clean' \ +--mean_std_path='data/librispeech/mean_std.npz' \ +--vocab_path='data/librispeech/eng_vocab.txt' \ +--model_path='checkpoints/params.latest.tar.gz' \ +--lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \ +--error_rate_type='wer' \ +--specgram_type='linear' diff --git a/infer.py b/infer.py index 43643cde..1ce969ae 100644 --- a/infer.py +++ b/infer.py @@ -4,134 +4,72 @@ from __future__ import division from __future__ import print_function import argparse -import distutils.util -import multiprocessing +import functools import paddle.v2 as paddle from data_utils.data import DataGenerator -from model import DeepSpeech2Model -from error_rate import wer, cer -import utils +from models.model import DeepSpeech2Model +from utils.error_rate import wer, cer +from utils.utility import add_arguments, print_arguments parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument( - "--num_samples", - default=10, - type=int, - help="Number of samples for inference. (default: %(default)s)") -parser.add_argument( - "--num_conv_layers", - default=2, - type=int, - help="Convolution layer number. (default: %(default)s)") -parser.add_argument( - "--num_rnn_layers", - default=3, - type=int, - help="RNN layer number. (default: %(default)s)") -parser.add_argument( - "--rnn_layer_size", - default=512, - type=int, - help="RNN layer cell number. (default: %(default)s)") -parser.add_argument( - "--use_gpu", - default=True, - type=distutils.util.strtobool, - help="Use gpu or not. (default: %(default)s)") -parser.add_argument( - "--num_threads_data", - default=1, - type=int, - help="Number of cpu threads for preprocessing data. (default: %(default)s)") -parser.add_argument( - "--num_processes_beam_search", - default=multiprocessing.cpu_count() // 2, - type=int, - help="Number of cpu processes for beam search. (default: %(default)s)") -parser.add_argument( - "--specgram_type", - default='linear', - type=str, - help="Feature type of audio data: 'linear' (power spectrum)" - " or 'mfcc'. (default: %(default)s)") -parser.add_argument( - "--trainer_count", - default=8, - type=int, - help="Trainer number. (default: %(default)s)") -parser.add_argument( - "--mean_std_filepath", - default='mean_std.npz', - type=str, - help="Manifest path for normalizer. (default: %(default)s)") -parser.add_argument( - "--decode_manifest_path", - default='datasets/manifest.test', - type=str, - help="Manifest path for decoding. (default: %(default)s)") -parser.add_argument( - "--model_filepath", - default='checkpoints/params.latest.tar.gz', - type=str, - help="Model filepath. (default: %(default)s)") -parser.add_argument( - "--vocab_filepath", - default='datasets/vocab/eng_vocab.txt', - type=str, - help="Vocabulary filepath. (default: %(default)s)") -parser.add_argument( - "--decode_method", - default='beam_search', - type=str, - help="Method for ctc decoding: best_path or beam_search. " - "(default: %(default)s)") -parser.add_argument( - "--beam_size", - default=500, - type=int, - help="Width for beam search decoding. (default: %(default)d)") -parser.add_argument( - "--language_model_path", - default="lm/data/common_crawl_00.prune01111.trie.klm", - type=str, - help="Path for language model. (default: %(default)s)") -parser.add_argument( - "--alpha", - default=0.36, - type=float, - help="Parameter associated with language model. (default: %(default)f)") -parser.add_argument( - "--beta", - default=0.25, - type=float, - help="Parameter associated with word count. (default: %(default)f)") -parser.add_argument( - "--cutoff_prob", - default=0.99, - type=float, - help="The cutoff probability of pruning" - "in beam search. (default: %(default)f)") -parser.add_argument( - "--error_rate_type", - default='wer', - choices=['wer', 'cer'], - type=str, - help="Error rate type for evaluation. 'wer' for word error rate and 'cer' " - "for character error rate. " - "(default: %(default)s)") +add_arg = functools.partial(add_arguments, argparser=parser) +# yapf: disable +add_arg('num_samples', int, 10, "# of samples to infer.") +add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).") +add_arg('beam_size', int, 500, "Beam search width.") +add_arg('num_proc_bsearch', int, 12, "# of CPUs for beam search.") +add_arg('num_conv_layers', int, 2, "# of convolution layers.") +add_arg('num_rnn_layers', int, 3, "# of recurrent layers.") +add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.") +add_arg('alpha', float, 0.36, "Coef of LM for beam search.") +add_arg('beta', float, 0.25, "Coef of WC for beam search.") +add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.") +add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.") +add_arg('use_gpu', bool, True, "Use GPU or not.") +add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " + "bi-directional RNNs. Not for GRU.") +add_arg('infer_manifest', str, + 'data/librispeech/manifest.dev-clean', + "Filepath of manifest to infer.") +add_arg('mean_std_path', str, + 'data/librispeech/mean_std.npz', + "Filepath of normalizer's mean & std.") +add_arg('vocab_path', str, + 'data/librispeech/eng_vocab.txt', + "Filepath of vocabulary.") +add_arg('lang_model_path', str, + 'lm/data/common_crawl_00.prune01111.trie.klm', + "Filepath for language model.") +add_arg('model_path', str, + './checkpoints/params.latest.tar.gz', + "If None, the training starts from scratch, " + "otherwise, it resumes from the pre-trained model.") +add_arg('decoding_method', str, + 'ctc_beam_search', + "Decoding method. Options: ctc_beam_search, ctc_greedy", + choices = ['ctc_beam_search', 'ctc_greedy']) +add_arg('error_rate_type', str, + 'wer', + "Error rate type for evaluation.", + choices=['wer', 'cer']) +add_arg('specgram_type', str, + 'linear', + "Audio feature type. Options: linear, mfcc.", + choices=['linear', 'mfcc']) +# yapf: disable args = parser.parse_args() def infer(): """Inference for DeepSpeech2.""" data_generator = DataGenerator( - vocab_filepath=args.vocab_filepath, - mean_std_filepath=args.mean_std_filepath, + vocab_filepath=args.vocab_path, + mean_std_filepath=args.mean_std_path, augmentation_config='{}', specgram_type=args.specgram_type, - num_threads=args.num_threads_data) + num_threads=1) batch_reader = data_generator.batch_reader_creator( - manifest_path=args.decode_manifest_path, + manifest_path=args.infer_manifest, batch_size=args.num_samples, min_batch_size=1, sortagrad=False, @@ -143,17 +81,19 @@ def infer(): num_conv_layers=args.num_conv_layers, num_rnn_layers=args.num_rnn_layers, rnn_layer_size=args.rnn_layer_size, - pretrained_model_path=args.model_filepath) + use_gru=args.use_gru, + pretrained_model_path=args.model_path, + share_rnn_weights=args.share_rnn_weights) result_transcripts = ds2_model.infer_batch( infer_data=infer_data, - decode_method=args.decode_method, + decoding_method=args.decoding_method, beam_alpha=args.alpha, beam_beta=args.beta, beam_size=args.beam_size, cutoff_prob=args.cutoff_prob, vocab_list=data_generator.vocab_list, - language_model_path=args.language_model_path, - num_processes=args.num_processes_beam_search) + language_model_path=args.lang_model_path, + num_processes=args.num_proc_bsearch) error_rate_func = cer if args.error_rate_type == 'cer' else wer target_transcripts = [ @@ -168,7 +108,7 @@ def infer(): def main(): - utils.print_arguments(args) + print_arguments(args) paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) infer() diff --git a/layer.py b/layer.py deleted file mode 100644 index 3b492645..00000000 --- a/layer.py +++ /dev/null @@ -1,177 +0,0 @@ -"""Contains DeepSpeech2 layers.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle.v2 as paddle - - -def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride, - padding, act): - """Convolution layer with batch normalization. - - :param input: Input layer. - :type input: LayerOutput - :param filter_size: The x dimension of a filter kernel. Or input a tuple for - two image dimension. - :type filter_size: int|tuple|list - :param num_channels_in: Number of input channels. - :type num_channels_in: int - :type num_channels_out: Number of output channels. - :type num_channels_in: out - :param padding: The x dimension of the padding. Or input a tuple for two - image dimension. - :type padding: int|tuple|list - :param act: Activation type. - :type act: BaseActivation - :return: Batch norm layer after convolution layer. - :rtype: LayerOutput - """ - conv_layer = paddle.layer.img_conv( - input=input, - filter_size=filter_size, - num_channels=num_channels_in, - num_filters=num_channels_out, - stride=stride, - padding=padding, - act=paddle.activation.Linear(), - bias_attr=False) - return paddle.layer.batch_norm(input=conv_layer, act=act) - - -def bidirectional_simple_rnn_bn_layer(name, input, size, act): - """Bidirectonal simple rnn layer with sequence-wise batch normalization. - The batch normalization is only performed on input-state weights. - - :param name: Name of the layer. - :type name: string - :param input: Input layer. - :type input: LayerOutput - :param size: Number of RNN cells. - :type size: int - :param act: Activation type. - :type act: BaseActivation - :return: Bidirectional simple rnn layer. - :rtype: LayerOutput - """ - # input-hidden weights shared across bi-direcitonal rnn. - input_proj = paddle.layer.fc( - input=input, size=size, act=paddle.activation.Linear(), bias_attr=False) - # batch norm is only performed on input-state projection - input_proj_bn = paddle.layer.batch_norm( - input=input_proj, act=paddle.activation.Linear()) - # forward and backward in time - forward_simple_rnn = paddle.layer.recurrent( - input=input_proj_bn, act=act, reverse=False) - backward_simple_rnn = paddle.layer.recurrent( - input=input_proj_bn, act=act, reverse=True) - return paddle.layer.concat(input=[forward_simple_rnn, backward_simple_rnn]) - - -def conv_group(input, num_stacks): - """Convolution group with stacked convolution layers. - - :param input: Input layer. - :type input: LayerOutput - :param num_stacks: Number of stacked convolution layers. - :type num_stacks: int - :return: Output layer of the convolution group. - :rtype: LayerOutput - """ - conv = conv_bn_layer( - input=input, - filter_size=(11, 41), - num_channels_in=1, - num_channels_out=32, - stride=(3, 2), - padding=(5, 20), - act=paddle.activation.BRelu()) - for i in xrange(num_stacks - 1): - conv = conv_bn_layer( - input=conv, - filter_size=(11, 21), - num_channels_in=32, - num_channels_out=32, - stride=(1, 2), - padding=(5, 10), - act=paddle.activation.BRelu()) - output_num_channels = 32 - output_height = 160 // pow(2, num_stacks) + 1 - return conv, output_num_channels, output_height - - -def rnn_group(input, size, num_stacks): - """RNN group with stacked bidirectional simple RNN layers. - - :param input: Input layer. - :type input: LayerOutput - :param size: Number of RNN cells in each layer. - :type size: int - :param num_stacks: Number of stacked rnn layers. - :type num_stacks: int - :return: Output layer of the RNN group. - :rtype: LayerOutput - """ - output = input - for i in xrange(num_stacks): - output = bidirectional_simple_rnn_bn_layer( - name=str(i), input=output, size=size, act=paddle.activation.BRelu()) - return output - - -def deep_speech2(audio_data, - text_data, - dict_size, - num_conv_layers=2, - num_rnn_layers=3, - rnn_size=256): - """ - The whole DeepSpeech2 model structure (a simplified version). - - :param audio_data: Audio spectrogram data layer. - :type audio_data: LayerOutput - :param text_data: Transcription text data layer. - :type text_data: LayerOutput - :param dict_size: Dictionary size for tokenized transcription. - :type dict_size: int - :param num_conv_layers: Number of stacking convolution layers. - :type num_conv_layers: int - :param num_rnn_layers: Number of stacking RNN layers. - :type num_rnn_layers: int - :param rnn_size: RNN layer size (number of RNN cells). - :type rnn_size: int - :return: A tuple of an output unnormalized log probability layer ( - before softmax) and a ctc cost layer. - :rtype: tuple of LayerOutput - """ - # convolution group - conv_group_output, conv_group_num_channels, conv_group_height = conv_group( - input=audio_data, num_stacks=num_conv_layers) - # convert data form convolution feature map to sequence of vectors - conv2seq = paddle.layer.block_expand( - input=conv_group_output, - num_channels=conv_group_num_channels, - stride_x=1, - stride_y=1, - block_x=1, - block_y=conv_group_height) - # rnn group - rnn_group_output = rnn_group( - input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers) - fc = paddle.layer.fc( - input=rnn_group_output, - size=dict_size + 1, - act=paddle.activation.Linear(), - bias_attr=True) - # probability distribution with softmax - log_probs = paddle.layer.mixed( - input=paddle.layer.identity_projection(input=fc), - act=paddle.activation.Softmax()) - # ctc cost - ctc_loss = paddle.layer.warp_ctc( - input=fc, - label=text_data, - size=dict_size + 1, - blank=dict_size, - norm_by_times=True) - return log_probs, ctc_loss diff --git a/models/__init__.py b/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/decoder.py b/models/decoder.py similarity index 95% rename from decoder.py rename to models/decoder.py index 8f2e0508..61ead25c 100644 --- a/decoder.py +++ b/models/decoder.py @@ -9,8 +9,9 @@ from math import log import multiprocessing -def ctc_best_path_decoder(probs_seq, vocabulary): - """Best path decoder, also called argmax decoder or greedy decoder. +def ctc_greedy_decoder(probs_seq, vocabulary): + """CTC greedy (best path) decoder. + Path consisting of the most probable tokens are further post-processed to remove consecutive repetitions and all blanks. @@ -45,10 +46,12 @@ def ctc_beam_search_decoder(probs_seq, cutoff_prob=1.0, ext_scoring_func=None, nproc=False): - """Beam search decoder for CTC-trained network. It utilizes beam search - to approximately select top best decoding labels and returning results - in the descending order. The implementation is based on Prefix - Beam Search (https://arxiv.org/abs/1408.2873), and the unclear part is + """CTC Beam search decoder. + + It utilizes beam search to approximately select top best decoding + labels and returning results in the descending order. + The implementation is based on Prefix Beam Search + (https://arxiv.org/abs/1408.2873), and the unclear part is redesigned. Two important modifications: 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation for one prefix may comes from different paths; 2) the if condition "if l^+ not diff --git a/model.py b/models/model.py similarity index 88% rename from model.py rename to models/model.py index 99412e59..93c4c41b 100644 --- a/model.py +++ b/models/model.py @@ -7,10 +7,10 @@ import sys import os import time import gzip -from decoder import * -from lm.lm_scorer import LmScorer import paddle.v2 as paddle -from layer import * +from lm.lm_scorer import LmScorer +from models.decoder import ctc_greedy_decoder, ctc_beam_search_decoder +from models.network import deep_speech_v2_network class DeepSpeech2Model(object): @@ -27,12 +27,17 @@ class DeepSpeech2Model(object): :param pretrained_model_path: Pretrained model path. If None, will train from stratch. :type pretrained_model_path: basestring|None + :param share_rnn_weights: Whether to share input-hidden weights between + forward and backward directional RNNs.Notice that + for GRU, weight sharing is not supported. + :type share_rnn_weights: bool """ def __init__(self, vocab_size, num_conv_layers, num_rnn_layers, - rnn_layer_size, pretrained_model_path): + rnn_layer_size, use_gru, pretrained_model_path, + share_rnn_weights): self._create_network(vocab_size, num_conv_layers, num_rnn_layers, - rnn_layer_size) + rnn_layer_size, use_gru, share_rnn_weights) self._create_parameters(pretrained_model_path) self._inferer = None self._loss_inferer = None @@ -141,7 +146,7 @@ class DeepSpeech2Model(object): # run inference return self._loss_inferer.infer(input=infer_data) - def infer_batch(self, infer_data, decode_method, beam_alpha, beam_beta, + def infer_batch(self, infer_data, decoding_method, beam_alpha, beam_beta, beam_size, cutoff_prob, vocab_list, language_model_path, num_processes): """Model inference. Infer the transcription for a batch of speech @@ -151,9 +156,9 @@ class DeepSpeech2Model(object): consisting of a tuple of audio features and transcription text (empty string). :type infer_data: list - :param decode_method: Decoding method name, 'best_path' or - 'beam search'. - :param decode_method: string + :param decoding_method: Decoding method name, 'ctc_greedy' or + 'ctc_beam_search'. + :param decoding_method: string :param beam_alpha: Parameter associated with language model. :type beam_alpha: float :param beam_beta: Parameter associated with word count. @@ -185,13 +190,13 @@ class DeepSpeech2Model(object): ] # run decoder results = [] - if decode_method == "best_path": + if decoding_method == "ctc_greedy": # best path decode for i, probs in enumerate(probs_split): - output_transcription = ctc_best_path_decoder( + output_transcription = ctc_greedy_decoder( probs_seq=probs, vocabulary=vocab_list) results.append(output_transcription) - elif decode_method == "beam_search": + elif decoding_method == "ctc_beam_search": # initialize external scorer if self._ext_scorer == None: self._ext_scorer = LmScorer(beam_alpha, beam_beta, @@ -200,7 +205,6 @@ class DeepSpeech2Model(object): else: self._ext_scorer.reset_params(beam_alpha, beam_beta) assert self._loaded_lm_path == language_model_path - # beam search decode beam_search_results = ctc_beam_search_decoder_batch( probs_split=probs_split, @@ -214,7 +218,7 @@ class DeepSpeech2Model(object): results = [result[0][1] for result in beam_search_results] else: raise ValueError("Decoding method [%s] is not supported." % - decode_method) + decoding_method) return results def _create_parameters(self, model_path=None): @@ -226,7 +230,7 @@ class DeepSpeech2Model(object): gzip.open(model_path)) def _create_network(self, vocab_size, num_conv_layers, num_rnn_layers, - rnn_layer_size): + rnn_layer_size, use_gru, share_rnn_weights): """Create data layers and model network.""" # paddle.data_type.dense_array is used for variable batch input. # The size 161 * 161 is only an placeholder value and the real shape @@ -237,10 +241,12 @@ class DeepSpeech2Model(object): text_data = paddle.layer.data( name="transcript_text", type=paddle.data_type.integer_value_sequence(vocab_size)) - self._log_probs, self._loss = deep_speech2( + self._log_probs, self._loss = deep_speech_v2_network( audio_data=audio_data, text_data=text_data, dict_size=vocab_size, num_conv_layers=num_conv_layers, num_rnn_layers=num_rnn_layers, - rnn_size=rnn_layer_size) + rnn_size=rnn_layer_size, + use_gru=use_gru, + share_rnn_weights=share_rnn_weights) diff --git a/models/network.py b/models/network.py new file mode 100644 index 00000000..13ba5d2c --- /dev/null +++ b/models/network.py @@ -0,0 +1,274 @@ +"""Contains DeepSpeech2 layers and networks.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle.v2 as paddle + + +def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride, + padding, act): + """Convolution layer with batch normalization. + + :param input: Input layer. + :type input: LayerOutput + :param filter_size: The x dimension of a filter kernel. Or input a tuple for + two image dimension. + :type filter_size: int|tuple|list + :param num_channels_in: Number of input channels. + :type num_channels_in: int + :type num_channels_out: Number of output channels. + :type num_channels_in: out + :param padding: The x dimension of the padding. Or input a tuple for two + image dimension. + :type padding: int|tuple|list + :param act: Activation type. + :type act: BaseActivation + :return: Batch norm layer after convolution layer. + :rtype: LayerOutput + """ + conv_layer = paddle.layer.img_conv( + input=input, + filter_size=filter_size, + num_channels=num_channels_in, + num_filters=num_channels_out, + stride=stride, + padding=padding, + act=paddle.activation.Linear(), + bias_attr=False) + return paddle.layer.batch_norm(input=conv_layer, act=act) + + +def bidirectional_simple_rnn_bn_layer(name, input, size, act, share_weights): + """Bidirectonal simple rnn layer with sequence-wise batch normalization. + The batch normalization is only performed on input-state weights. + + :param name: Name of the layer. + :type name: string + :param input: Input layer. + :type input: LayerOutput + :param size: Number of RNN cells. + :type size: int + :param act: Activation type. + :type act: BaseActivation + :param share_weights: Whether to share input-hidden weights between + forward and backward directional RNNs. + :type share_weights: bool + :return: Bidirectional simple rnn layer. + :rtype: LayerOutput + """ + if share_weights: + # input-hidden weights shared between bi-direcitonal rnn. + input_proj = paddle.layer.fc( + input=input, + size=size, + act=paddle.activation.Linear(), + bias_attr=False) + # batch norm is only performed on input-state projection + input_proj_bn = paddle.layer.batch_norm( + input=input_proj, act=paddle.activation.Linear()) + # forward and backward in time + forward_simple_rnn = paddle.layer.recurrent( + input=input_proj_bn, act=act, reverse=False) + backward_simple_rnn = paddle.layer.recurrent( + input=input_proj_bn, act=act, reverse=True) + + else: + input_proj_forward = paddle.layer.fc( + input=input, + size=size, + act=paddle.activation.Linear(), + bias_attr=False) + input_proj_backward = paddle.layer.fc( + input=input, + size=size, + act=paddle.activation.Linear(), + bias_attr=False) + # batch norm is only performed on input-state projection + input_proj_bn_forward = paddle.layer.batch_norm( + input=input_proj_forward, act=paddle.activation.Linear()) + input_proj_bn_backward = paddle.layer.batch_norm( + input=input_proj_backward, act=paddle.activation.Linear()) + # forward and backward in time + forward_simple_rnn = paddle.layer.recurrent( + input=input_proj_bn_forward, act=act, reverse=False) + backward_simple_rnn = paddle.layer.recurrent( + input=input_proj_bn_backward, act=act, reverse=True) + + return paddle.layer.concat(input=[forward_simple_rnn, backward_simple_rnn]) + + +def bidirectional_gru_bn_layer(name, input, size, act): + """Bidirectonal gru layer with sequence-wise batch normalization. + The batch normalization is only performed on input-state weights. + + :param name: Name of the layer. + :type name: string + :param input: Input layer. + :type input: LayerOutput + :param size: Number of RNN cells. + :type size: int + :param act: Activation type. + :type act: BaseActivation + :return: Bidirectional simple rnn layer. + :rtype: LayerOutput + """ + input_proj_forward = paddle.layer.fc( + input=input, + size=size * 3, + act=paddle.activation.Linear(), + bias_attr=False) + input_proj_backward = paddle.layer.fc( + input=input, + size=size * 3, + act=paddle.activation.Linear(), + bias_attr=False) + # batch norm is only performed on input-related projections + input_proj_bn_forward = paddle.layer.batch_norm( + input=input_proj_forward, act=paddle.activation.Linear()) + input_proj_bn_backward = paddle.layer.batch_norm( + input=input_proj_backward, act=paddle.activation.Linear()) + # forward and backward in time + forward_gru = paddle.layer.grumemory( + input=input_proj_bn_forward, act=act, reverse=False) + backward_gru = paddle.layer.grumemory( + input=input_proj_bn_backward, act=act, reverse=True) + return paddle.layer.concat(input=[forward_gru, backward_gru]) + + +def conv_group(input, num_stacks): + """Convolution group with stacked convolution layers. + + :param input: Input layer. + :type input: LayerOutput + :param num_stacks: Number of stacked convolution layers. + :type num_stacks: int + :return: Output layer of the convolution group. + :rtype: LayerOutput + """ + conv = conv_bn_layer( + input=input, + filter_size=(11, 41), + num_channels_in=1, + num_channels_out=32, + stride=(3, 2), + padding=(5, 20), + act=paddle.activation.BRelu()) + for i in xrange(num_stacks - 1): + conv = conv_bn_layer( + input=conv, + filter_size=(11, 21), + num_channels_in=32, + num_channels_out=32, + stride=(1, 2), + padding=(5, 10), + act=paddle.activation.BRelu()) + output_num_channels = 32 + output_height = 160 // pow(2, num_stacks) + 1 + return conv, output_num_channels, output_height + + +def rnn_group(input, size, num_stacks, use_gru, share_rnn_weights): + """RNN group with stacked bidirectional simple RNN layers. + + :param input: Input layer. + :type input: LayerOutput + :param size: Number of RNN cells in each layer. + :type size: int + :param num_stacks: Number of stacked rnn layers. + :type num_stacks: int + :param use_gru: Use gru if set True. Use simple rnn if set False. + :type use_gru: bool + :param share_rnn_weights: Whether to share input-hidden weights between + forward and backward directional RNNs. + It is only available when use_gru=False. + :type share_weights: bool + :return: Output layer of the RNN group. + :rtype: LayerOutput + """ + output = input + for i in xrange(num_stacks): + if use_gru: + output = bidirectional_gru_bn_layer( + name=str(i), + input=output, + size=size, + act=paddle.activation.Relu()) + # BRelu does not support hppl, need to add later. Use Relu instead. + else: + output = bidirectional_simple_rnn_bn_layer( + name=str(i), + input=output, + size=size, + act=paddle.activation.BRelu(), + share_weights=share_rnn_weights) + return output + + +def deep_speech_v2_network(audio_data, + text_data, + dict_size, + num_conv_layers=2, + num_rnn_layers=3, + rnn_size=256, + use_gru=False, + share_rnn_weights=True): + """The DeepSpeech2 network structure. + + :param audio_data: Audio spectrogram data layer. + :type audio_data: LayerOutput + :param text_data: Transcription text data layer. + :type text_data: LayerOutput + :param dict_size: Dictionary size for tokenized transcription. + :type dict_size: int + :param num_conv_layers: Number of stacking convolution layers. + :type num_conv_layers: int + :param num_rnn_layers: Number of stacking RNN layers. + :type num_rnn_layers: int + :param rnn_size: RNN layer size (number of RNN cells). + :type rnn_size: int + :param use_gru: Use gru if set True. Use simple rnn if set False. + :type use_gru: bool + :param share_rnn_weights: Whether to share input-hidden weights between + forward and backward direction RNNs. + It is only available when use_gru=False. + :type share_weights: bool + :return: A tuple of an output unnormalized log probability layer ( + before softmax) and a ctc cost layer. + :rtype: tuple of LayerOutput + """ + # convolution group + conv_group_output, conv_group_num_channels, conv_group_height = conv_group( + input=audio_data, num_stacks=num_conv_layers) + # convert data form convolution feature map to sequence of vectors + conv2seq = paddle.layer.block_expand( + input=conv_group_output, + num_channels=conv_group_num_channels, + stride_x=1, + stride_y=1, + block_x=1, + block_y=conv_group_height) + # rnn group + rnn_group_output = rnn_group( + input=conv2seq, + size=rnn_size, + num_stacks=num_rnn_layers, + use_gru=use_gru, + share_rnn_weights=share_rnn_weights) + fc = paddle.layer.fc( + input=rnn_group_output, + size=dict_size + 1, + act=paddle.activation.Linear(), + bias_attr=True) + # probability distribution with softmax + log_probs = paddle.layer.mixed( + input=paddle.layer.identity_projection(input=fc), + act=paddle.activation.Softmax()) + # ctc cost + ctc_loss = paddle.layer.warp_ctc( + input=fc, + label=text_data, + size=dict_size + 1, + blank=dict_size, + norm_by_times=True) + return log_probs, ctc_loss diff --git a/tests/test_decoders.py b/models/tests/test_decoders.py similarity index 80% rename from tests/test_decoders.py rename to models/tests/test_decoders.py index 99d8a828..acce46af 100644 --- a/tests/test_decoders.py +++ b/models/tests/test_decoders.py @@ -4,7 +4,7 @@ from __future__ import division from __future__ import print_function import unittest -from decoder import * +from models import decoder class TestDecoders(unittest.TestCase): @@ -49,19 +49,21 @@ class TestDecoders(unittest.TestCase): 0.15882358, 0.1235788, 0.23376776, 0.20510435, 0.00279306, 0.05294827, 0.22298418 ]] - self.best_path_result = ["ac'bdc", "b'da"] + self.greedy_result = ["ac'bdc", "b'da"] self.beam_search_result = ['acdc', "b'a"] - def test_best_path_decoder_1(self): - bst_result = ctc_best_path_decoder(self.probs_seq1, self.vocab_list) - self.assertEqual(bst_result, self.best_path_result[0]) + def test_greedy_decoder_1(self): + bst_result = decoder.ctc_greedy_decoder(self.probs_seq1, + self.vocab_list) + self.assertEqual(bst_result, self.greedy_result[0]) - def test_best_path_decoder_2(self): - bst_result = ctc_best_path_decoder(self.probs_seq2, self.vocab_list) - self.assertEqual(bst_result, self.best_path_result[1]) + def test_greedy_decoder_2(self): + bst_result = decoder.ctc_greedy_decoder(self.probs_seq2, + self.vocab_list) + self.assertEqual(bst_result, self.greedy_result[1]) def test_beam_search_decoder_1(self): - beam_result = ctc_beam_search_decoder( + beam_result = decoder.ctc_beam_search_decoder( probs_seq=self.probs_seq1, beam_size=self.beam_size, vocabulary=self.vocab_list, @@ -69,7 +71,7 @@ class TestDecoders(unittest.TestCase): self.assertEqual(beam_result[0][1], self.beam_search_result[0]) def test_beam_search_decoder_2(self): - beam_result = ctc_beam_search_decoder( + beam_result = decoder.ctc_beam_search_decoder( probs_seq=self.probs_seq2, beam_size=self.beam_size, vocabulary=self.vocab_list, @@ -77,7 +79,7 @@ class TestDecoders(unittest.TestCase): self.assertEqual(beam_result[0][1], self.beam_search_result[1]) def test_beam_search_decoder_batch(self): - beam_results = ctc_beam_search_decoder_batch( + beam_results = decoder.ctc_beam_search_decoder_batch( probs_split=[self.probs_seq1, self.probs_seq2], beam_size=self.beam_size, vocabulary=self.vocab_list, diff --git a/test.py b/test.py new file mode 100644 index 00000000..747e40df --- /dev/null +++ b/test.py @@ -0,0 +1,121 @@ +"""Evaluation for DeepSpeech2 model.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import functools +import paddle.v2 as paddle +from data_utils.data import DataGenerator +from models.model import DeepSpeech2Model +from utils.error_rate import wer, cer +from utils.utility import add_arguments, print_arguments + +parser = argparse.ArgumentParser(description=__doc__) +add_arg = functools.partial(add_arguments, argparser=parser) +# yapf: disable +add_arg('batch_size', int, 128, "Minibatch size.") +add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).") +add_arg('beam_size', int, 500, "Beam search width.") +add_arg('num_proc_bsearch', int, 12, "# of CPUs for beam search.") +add_arg('num_proc_data', int, 12, "# of CPUs for data preprocessing.") +add_arg('num_conv_layers', int, 2, "# of convolution layers.") +add_arg('num_rnn_layers', int, 3, "# of recurrent layers.") +add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.") +add_arg('alpha', float, 0.36, "Coef of LM for beam search.") +add_arg('beta', float, 0.25, "Coef of WC for beam search.") +add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.") +add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.") +add_arg('use_gpu', bool, True, "Use GPU or not.") +add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " + "bi-directional RNNs. Not for GRU.") +add_arg('test_manifest', str, + 'data/librispeech/manifest.test-clean', + "Filepath of manifest to evaluate.") +add_arg('mean_std_path', str, + 'data/librispeech/mean_std.npz', + "Filepath of normalizer's mean & std.") +add_arg('vocab_path', str, + 'data/librispeech/eng_vocab.txt', + "Filepath of vocabulary.") +add_arg('model_path', str, + './checkpoints/params.latest.tar.gz', + "If None, the training starts from scratch, " + "otherwise, it resumes from the pre-trained model.") +add_arg('lang_model_path', str, + 'lm/data/common_crawl_00.prune01111.trie.klm', + "Filepath for language model.") +add_arg('decoding_method', str, + 'ctc_beam_search', + "Decoding method. Options: ctc_beam_search, ctc_greedy", + choices = ['ctc_beam_search', 'ctc_greedy']) +add_arg('error_rate_type', str, + 'wer', + "Error rate type for evaluation.", + choices=['wer', 'cer']) +add_arg('specgram_type', str, + 'linear', + "Audio feature type. Options: linear, mfcc.", + choices=['linear', 'mfcc']) +# yapf: disable +args = parser.parse_args() + + +def evaluate(): + """Evaluate on whole test data for DeepSpeech2.""" + data_generator = DataGenerator( + vocab_filepath=args.vocab_path, + mean_std_filepath=args.mean_std_path, + augmentation_config='{}', + specgram_type=args.specgram_type, + num_threads=args.num_proc_data) + batch_reader = data_generator.batch_reader_creator( + manifest_path=args.test_manifest, + batch_size=args.batch_size, + min_batch_size=1, + sortagrad=False, + shuffle_method=None) + + ds2_model = DeepSpeech2Model( + vocab_size=data_generator.vocab_size, + num_conv_layers=args.num_conv_layers, + num_rnn_layers=args.num_rnn_layers, + rnn_layer_size=args.rnn_layer_size, + use_gru=args.use_gru, + pretrained_model_path=args.model_path, + share_rnn_weights=args.share_rnn_weights) + + error_rate_func = cer if args.error_rate_type == 'cer' else wer + error_sum, num_ins = 0.0, 0 + for infer_data in batch_reader(): + result_transcripts = ds2_model.infer_batch( + infer_data=infer_data, + decoding_method=args.decoding_method, + beam_alpha=args.alpha, + beam_beta=args.beta, + beam_size=args.beam_size, + cutoff_prob=args.cutoff_prob, + vocab_list=data_generator.vocab_list, + language_model_path=args.lang_model_path, + num_processes=args.num_proc_bsearch) + target_transcripts = [ + ''.join([data_generator.vocab_list[token] for token in transcript]) + for _, transcript in infer_data + ] + for target, result in zip(target_transcripts, result_transcripts): + error_sum += error_rate_func(target, result) + num_ins += 1 + print("Error rate [%s] (%d/?) = %f" % + (args.error_rate_type, num_ins, error_sum / num_ins)) + print("Final error rate [%s] (%d/%d) = %f" % + (args.error_rate_type, num_ins, num_ins, error_sum / num_ins)) + + +def main(): + print_arguments(args) + paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) + evaluate() + + +if __name__ == '__main__': + main() diff --git a/tests/test_setup.py b/tests/test_setup.py deleted file mode 100644 index 18b9c1a0..00000000 --- a/tests/test_setup.py +++ /dev/null @@ -1,23 +0,0 @@ -"""Test Setup.""" -import unittest -import numpy as np -import os - - -class TestSetup(unittest.TestCase): - def test_soundfile(self): - import soundfile as sf - # floating point data is typically limited to the interval [-1.0, 1.0], - # but smaller/larger values are supported as well - data = np.array([[1.75, -1.75], [1.0, -1.0], [0.5, -0.5], - [0.25, -0.25]]) - file = 'test.wav' - sf.write(file, data, 44100, format='WAV', subtype='FLOAT') - read, fs = sf.read(file) - self.assertTrue(np.all(read == data)) - self.assertEqual(fs, 44100) - os.remove(file) - - -if __name__ == '__main__': - unittest.main() diff --git a/tools/build_vocab.py b/tools/build_vocab.py index 618f2498..6fbb9bdf 100644 --- a/tools/build_vocab.py +++ b/tools/build_vocab.py @@ -7,32 +7,29 @@ from __future__ import division from __future__ import print_function import argparse +import functools import codecs import json from collections import Counter import os.path import _init_paths -from data_utils import utils +from data_utils.utility import read_manifest +from utils.utility import add_arguments, print_arguments parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument( - "--manifest_paths", - type=str, - help="Manifest paths for building vocabulary." - "You can provide multiple manifest files.", - nargs='+', - required=True) -parser.add_argument( - "--count_threshold", - default=0, - type=int, - help="Characters whose counts are below the threshold will be truncated. " - "(default: %(default)i)") -parser.add_argument( - "--vocab_path", - default='datasets/vocab/zh_vocab.txt', - type=str, - help="File path to write the vocabulary. (default: %(default)s)") +add_arg = functools.partial(add_arguments, argparser=parser) +# yapf: disable +add_arg('count_threshold', int, 0, "Truncation threshold for char counts.") +add_arg('vocab_path', str, + 'datasets/vocab/zh_vocab.txt', + "Filepath to write the vocabulary.") +add_arg('manifest_paths', str, + None, + "Filepaths of manifests for building vocabulary. " + "You can provide multiple manifest files.", + nargs='+', + required=True) +# yapf: disable args = parser.parse_args() @@ -44,6 +41,8 @@ def count_manifest(counter, manifest_path): def main(): + print_arguments(args) + counter = Counter() for manifest_path in args.manifest_paths: count_manifest(counter, manifest_path) diff --git a/tools/compute_mean_std.py b/tools/compute_mean_std.py index da49eb4c..5bb6be39 100644 --- a/tools/compute_mean_std.py +++ b/tools/compute_mean_std.py @@ -4,48 +4,35 @@ from __future__ import division from __future__ import print_function import argparse +import functools import _init_paths from data_utils.normalizer import FeatureNormalizer from data_utils.augmentor.augmentation import AugmentationPipeline from data_utils.featurizer.audio_featurizer import AudioFeaturizer - -parser = argparse.ArgumentParser( - description='Computing mean and stddev for feature normalizer.') -parser.add_argument( - "--specgram_type", - default='linear', - type=str, - help="Feature type of audio data: 'linear' (power spectrum)" - " or 'mfcc'. (default: %(default)s)") -parser.add_argument( - "--manifest_path", - default='datasets/manifest.train', - type=str, - help="Manifest path for computing normalizer's mean and stddev." - "(default: %(default)s)") -parser.add_argument( - "--num_samples", - default=2000, - type=int, - help="Number of samples for computing mean and stddev. " - "(default: %(default)s)") -parser.add_argument( - "--augmentation_config", - default='{}', - type=str, - help="Augmentation configuration in json-format. " - "(default: %(default)s)") -parser.add_argument( - "--output_file", - default='mean_std.npz', - type=str, - help="Filepath to write mean and std to (.npz)." - "(default: %(default)s)") +from utils.utility import add_arguments, print_arguments + +parser = argparse.ArgumentParser(description=__doc__) +add_arg = functools.partial(add_arguments, argparser=parser) +# yapf: disable +add_arg('num_samples', int, 2000, "# of samples to for statistics.") +add_arg('specgram_type', str, + 'linear', + "Audio feature type. Options: linear, mfcc.", + choices=['linear', 'mfcc']) +add_arg('manifest_path', str, + 'datasets/manifest.train', + "Filepath of manifest to compute normalizer's mean and stddev.") +add_arg('output_path', str, + 'mean_std.npz', + "Filepath of write mean and stddev to (.npz).") +# yapf: disable args = parser.parse_args() def main(): - augmentation_pipeline = AugmentationPipeline(args.augmentation_config) + print_arguments(args) + + augmentation_pipeline = AugmentationPipeline('{}') audio_featurizer = AudioFeaturizer(specgram_type=args.specgram_type) def augment_and_featurize(audio_segment): @@ -57,7 +44,7 @@ def main(): manifest_path=args.manifest_path, featurize_func=augment_and_featurize, num_samples=args.num_samples) - normalizer.write_to_file(args.output_file) + normalizer.write_to_file(args.output_path) if __name__ == '__main__': diff --git a/tools/tune.py b/tools/tune.py new file mode 100644 index 00000000..7a237910 --- /dev/null +++ b/tools/tune.py @@ -0,0 +1,131 @@ +"""Beam search parameters tuning for DeepSpeech2 model.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import argparse +import functools +import paddle.v2 as paddle +import _init_paths +from data_utils.data import DataGenerator +from models.model import DeepSpeech2Model +from utils.error_rate import wer +from utils.utility import add_arguments, print_arguments + +parser = argparse.ArgumentParser(description=__doc__) +add_arg = functools.partial(add_arguments, argparser=parser) +# yapf: disable +add_arg('num_samples', int, 100, "# of samples to infer.") +add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).") +add_arg('beam_size', int, 500, "Beam search width.") +add_arg('num_proc_bsearch', int, 12, "# of CPUs for beam search.") +add_arg('num_conv_layers', int, 2, "# of convolution layers.") +add_arg('num_rnn_layers', int, 3, "# of recurrent layers.") +add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.") +add_arg('num_alphas', int, 14, "# of alpha candidates for tuning.") +add_arg('num_betas', int, 20, "# of beta candidates for tuning.") +add_arg('alpha_from', float, 0.1, "Where alpha starts tuning from.") +add_arg('alpha_to', float, 0.36, "Where alpha ends tuning with.") +add_arg('beta_from', float, 0.05, "Where beta starts tuning from.") +add_arg('beta_to', float, 1.0, "Where beta ends tuning with.") +add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.") +add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.") +add_arg('use_gpu', bool, True, "Use GPU or not.") +add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " + "bi-directional RNNs. Not for GRU.") +add_arg('tune_manifest', str, + 'data/librispeech/manifest.dev', + "Filepath of manifest to tune.") +add_arg('mean_std_path', str, + 'data/librispeech/mean_std.npz', + "Filepath of normalizer's mean & std.") +add_arg('vocab_path', str, + 'data/librispeech/eng_vocab.txt', + "Filepath of vocabulary.") +add_arg('lang_model_path', str, + 'lm/data/common_crawl_00.prune01111.trie.klm', + "Filepath for language model.") +add_arg('model_path', str, + './checkpoints/params.latest.tar.gz', + "If None, the training starts from scratch, " + "otherwise, it resumes from the pre-trained model.") +add_arg('error_rate_type', str, + 'wer', + "Error rate type for evaluation.", + choices=['wer', 'cer']) +add_arg('specgram_type', str, + 'linear', + "Audio feature type. Options: linear, mfcc.", + choices=['linear', 'mfcc']) +# yapf: disable +args = parser.parse_args() + + +def tune(): + """Tune parameters alpha and beta on one minibatch.""" + if not args.num_alphas >= 0: + raise ValueError("num_alphas must be non-negative!") + if not args.num_betas >= 0: + raise ValueError("num_betas must be non-negative!") + + data_generator = DataGenerator( + vocab_filepath=args.vocab_path, + mean_std_filepath=args.mean_std_path, + augmentation_config='{}', + specgram_type=args.specgram_type, + num_threads=1) + batch_reader = data_generator.batch_reader_creator( + manifest_path=args.tune_manifest, + batch_size=args.num_samples, + sortagrad=False, + shuffle_method=None) + tune_data = batch_reader().next() + target_transcripts = [ + ''.join([data_generator.vocab_list[token] for token in transcript]) + for _, transcript in tune_data + ] + + ds2_model = DeepSpeech2Model( + vocab_size=data_generator.vocab_size, + num_conv_layers=args.num_conv_layers, + num_rnn_layers=args.num_rnn_layers, + rnn_layer_size=args.rnn_layer_size, + use_gru=args.use_gru, + pretrained_model_path=args.model_path, + share_rnn_weights=args.share_rnn_weights) + + # create grid for search + cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas) + cand_betas = np.linspace(args.beta_from, args.beta_to, args.num_betas) + params_grid = [(alpha, beta) for alpha in cand_alphas + for beta in cand_betas] + + ## tune parameters in loop + for alpha, beta in params_grid: + result_transcripts = ds2_model.infer_batch( + infer_data=tune_data, + decoding_method='ctc_beam_search', + beam_alpha=alpha, + beam_beta=beta, + beam_size=args.beam_size, + cutoff_prob=args.cutoff_prob, + vocab_list=data_generator.vocab_list, + language_model_path=args.lang_model_path, + num_processes=args.num_proc_bsearch) + wer_sum, num_ins = 0.0, 0 + for target, result in zip(target_transcripts, result_transcripts): + wer_sum += wer(target, result) + num_ins += 1 + print("alpha = %f\tbeta = %f\tWER = %f" % + (alpha, beta, wer_sum / num_ins)) + + +def main(): + print_arguments(args) + paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) + tune() + + +if __name__ == '__main__': + main() diff --git a/train.py b/train.py index 262d8bf0..4a7a0eda 100644 --- a/train.py +++ b/train.py @@ -4,162 +4,91 @@ from __future__ import division from __future__ import print_function import argparse -import distutils.util -import multiprocessing +import functools import paddle.v2 as paddle -from model import DeepSpeech2Model +from models.model import DeepSpeech2Model from data_utils.data import DataGenerator -import utils +from utils.utility import add_arguments, print_arguments parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument( - "--batch_size", default=256, type=int, help="Minibatch size.") -parser.add_argument( - "--num_passes", - default=200, - type=int, - help="Training pass number. (default: %(default)s)") -parser.add_argument( - "--num_iterations_print", - default=100, - type=int, - help="Number of iterations for every train cost printing. " - "(default: %(default)s)") -parser.add_argument( - "--num_conv_layers", - default=2, - type=int, - help="Convolution layer number. (default: %(default)s)") -parser.add_argument( - "--num_rnn_layers", - default=3, - type=int, - help="RNN layer number. (default: %(default)s)") -parser.add_argument( - "--rnn_layer_size", - default=512, - type=int, - help="RNN layer cell number. (default: %(default)s)") -parser.add_argument( - "--adam_learning_rate", - default=5e-4, - type=float, - help="Learning rate for ADAM Optimizer. (default: %(default)s)") -parser.add_argument( - "--use_gpu", - default=True, - type=distutils.util.strtobool, - help="Use gpu or not. (default: %(default)s)") -parser.add_argument( - "--use_sortagrad", - default=True, - type=distutils.util.strtobool, - help="Use sortagrad or not. (default: %(default)s)") -parser.add_argument( - "--specgram_type", - default='linear', - type=str, - help="Feature type of audio data: 'linear' (power spectrum)" - " or 'mfcc'. (default: %(default)s)") -parser.add_argument( - "--max_duration", - default=27.0, - type=float, - help="Audios with duration larger than this will be discarded. " - "(default: %(default)s)") -parser.add_argument( - "--min_duration", - default=0.0, - type=float, - help="Audios with duration smaller than this will be discarded. " - "(default: %(default)s)") -parser.add_argument( - "--shuffle_method", - default='batch_shuffle_clipped', - type=str, - help="Shuffle method: 'instance_shuffle', 'batch_shuffle', " - "'batch_shuffle_batch'. (default: %(default)s)") -parser.add_argument( - "--trainer_count", - default=8, - type=int, - help="Trainer number. (default: %(default)s)") -parser.add_argument( - "--num_threads_data", - default=multiprocessing.cpu_count() // 2, - type=int, - help="Number of cpu threads for preprocessing data. (default: %(default)s)") -parser.add_argument( - "--mean_std_filepath", - default='mean_std.npz', - type=str, - help="Manifest path for normalizer. (default: %(default)s)") -parser.add_argument( - "--train_manifest_path", - default='datasets/manifest.train', - type=str, - help="Manifest path for training. (default: %(default)s)") -parser.add_argument( - "--dev_manifest_path", - default='datasets/manifest.dev', - type=str, - help="Manifest path for validation. (default: %(default)s)") -parser.add_argument( - "--vocab_filepath", - default='datasets/vocab/eng_vocab.txt', - type=str, - help="Vocabulary filepath. (default: %(default)s)") -parser.add_argument( - "--init_model_path", - default=None, - type=str, - help="If set None, the training will start from scratch. " - "Otherwise, the training will resume from " - "the existing model of this path. (default: %(default)s)") -parser.add_argument( - "--output_model_dir", - default="./checkpoints", - type=str, - help="Directory for saving models. (default: %(default)s)") -parser.add_argument( - "--augmentation_config", - default=open('conf/augmentation.config', 'r').read(), - type=str, - help="Augmentation configuration in json-format. " - "(default: %(default)s)") -parser.add_argument( - "--is_local", - default=True, - type=distutils.util.strtobool, - help="Set to false if running with pserver in paddlecloud. " - "(default: %(default)s)") +add_arg = functools.partial(add_arguments, argparser=parser) +# yapf: disable +add_arg('batch_size', int, 256, "Minibatch size.") +add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).") +add_arg('num_passes', int, 200, "# of training epochs.") +add_arg('num_proc_data', int, 12, "# of CPUs for data preprocessing.") +add_arg('num_conv_layers', int, 2, "# of convolution layers.") +add_arg('num_rnn_layers', int, 3, "# of recurrent layers.") +add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.") +add_arg('num_iter_print', int, 100, "Every # iterations for printing " + "train cost.") +add_arg('learning_rate', float, 5e-4, "Learning rate.") +add_arg('max_duration', float, 27.0, "Longest audio duration allowed.") +add_arg('min_duration', float, 0.0, "Shortest audio duration allowed.") +add_arg('use_sortagrad', bool, True, "Use SortaGrad or not.") +add_arg('use_gpu', bool, True, "Use GPU or not.") +add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.") +add_arg('is_local', bool, True, "Use pserver or not.") +add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " + "bi-directional RNNs. Not for GRU.") +add_arg('train_manifest', str, + 'data/librispeech/manifest.train', + "Filepath of train manifest.") +add_arg('dev_manifest', str, + 'data/librispeech/manifest.dev-clean', + "Filepath of validation manifest.") +add_arg('mean_std_path', str, + 'data/librispeech/mean_std.npz', + "Filepath of normalizer's mean & std.") +add_arg('vocab_path', str, + 'data/librispeech/eng_vocab.txt', + "Filepath of vocabulary.") +add_arg('init_model_path', str, + None, + "If None, the training starts from scratch, " + "otherwise, it resumes from the pre-trained model.") +add_arg('output_model_dir', str, + "./checkpoints", + "Directory for saving checkpoints.") +add_arg('augment_conf_path',str, + 'conf/augmentation.config', + "Filepath of augmentation configuration file (json-format).") +add_arg('specgram_type', str, + 'linear', + "Audio feature type. Options: linear, mfcc.", + choices=['linear', 'mfcc']) +add_arg('shuffle_method', str, + 'batch_shuffle_clipped', + "Shuffle method.", + choices=['instance_shuffle', 'batch_shuffle', 'batch_shuffle_clipped']) +# yapf: disable args = parser.parse_args() def train(): """DeepSpeech2 training.""" train_generator = DataGenerator( - vocab_filepath=args.vocab_filepath, - mean_std_filepath=args.mean_std_filepath, - augmentation_config=args.augmentation_config, + vocab_filepath=args.vocab_path, + mean_std_filepath=args.mean_std_path, + augmentation_config=open(args.augment_conf_path, 'r').read(), max_duration=args.max_duration, min_duration=args.min_duration, specgram_type=args.specgram_type, - num_threads=args.num_threads_data) + num_threads=args.num_proc_data) dev_generator = DataGenerator( - vocab_filepath=args.vocab_filepath, - mean_std_filepath=args.mean_std_filepath, + vocab_filepath=args.vocab_path, + mean_std_filepath=args.mean_std_path, augmentation_config="{}", specgram_type=args.specgram_type, - num_threads=args.num_threads_data) + num_threads=args.num_proc_data) train_batch_reader = train_generator.batch_reader_creator( - manifest_path=args.train_manifest_path, + manifest_path=args.train_manifest, batch_size=args.batch_size, min_batch_size=args.trainer_count, sortagrad=args.use_sortagrad if args.init_model_path is None else False, shuffle_method=args.shuffle_method) dev_batch_reader = dev_generator.batch_reader_creator( - manifest_path=args.dev_manifest_path, + manifest_path=args.dev_manifest, batch_size=args.batch_size, min_batch_size=1, # must be 1, but will have errors. sortagrad=False, @@ -170,21 +99,23 @@ def train(): num_conv_layers=args.num_conv_layers, num_rnn_layers=args.num_rnn_layers, rnn_layer_size=args.rnn_layer_size, - pretrained_model_path=args.init_model_path) + use_gru=args.use_gru, + pretrained_model_path=args.init_model_path, + share_rnn_weights=args.share_rnn_weights) ds2_model.train( train_batch_reader=train_batch_reader, dev_batch_reader=dev_batch_reader, feeding_dict=train_generator.feeding, - learning_rate=args.adam_learning_rate, + learning_rate=args.learning_rate, gradient_clipping=400, num_passes=args.num_passes, - num_iterations_print=args.num_iterations_print, + num_iterations_print=args.num_iter_print, output_model_dir=args.output_model_dir, is_local=args.is_local) def main(): - utils.print_arguments(args) + print_arguments(args) paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) train() diff --git a/tune.py b/tune.py deleted file mode 100644 index 328d67a1..00000000 --- a/tune.py +++ /dev/null @@ -1,196 +0,0 @@ -"""Parameters tuning for DeepSpeech2 model.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -import distutils.util -import argparse -import multiprocessing -import paddle.v2 as paddle -from data_utils.data import DataGenerator -from model import DeepSpeech2Model -from error_rate import wer -import utils - -parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument( - "--num_samples", - default=100, - type=int, - help="Number of samples for parameters tuning. (default: %(default)s)") -parser.add_argument( - "--num_conv_layers", - default=2, - type=int, - help="Convolution layer number. (default: %(default)s)") -parser.add_argument( - "--num_rnn_layers", - default=3, - type=int, - help="RNN layer number. (default: %(default)s)") -parser.add_argument( - "--rnn_layer_size", - default=512, - type=int, - help="RNN layer cell number. (default: %(default)s)") -parser.add_argument( - "--use_gpu", - default=True, - type=distutils.util.strtobool, - help="Use gpu or not. (default: %(default)s)") -parser.add_argument( - "--trainer_count", - default=8, - type=int, - help="Trainer number. (default: %(default)s)") -parser.add_argument( - "--num_threads_data", - default=1, - type=int, - help="Number of cpu threads for preprocessing data. (default: %(default)s)") -parser.add_argument( - "--num_processes_beam_search", - default=multiprocessing.cpu_count() // 2, - type=int, - help="Number of cpu processes for beam search. (default: %(default)s)") -parser.add_argument( - "--specgram_type", - default='linear', - type=str, - help="Feature type of audio data: 'linear' (power spectrum)" - " or 'mfcc'. (default: %(default)s)") -parser.add_argument( - "--mean_std_filepath", - default='mean_std.npz', - type=str, - help="Manifest path for normalizer. (default: %(default)s)") -parser.add_argument( - "--tune_manifest_path", - default='datasets/manifest.dev', - type=str, - help="Manifest path for tuning. (default: %(default)s)") -parser.add_argument( - "--model_filepath", - default='checkpoints/params.latest.tar.gz', - type=str, - help="Model filepath. (default: %(default)s)") -parser.add_argument( - "--vocab_filepath", - default='datasets/vocab/eng_vocab.txt', - type=str, - help="Vocabulary filepath. (default: %(default)s)") -parser.add_argument( - "--beam_size", - default=500, - type=int, - help="Width for beam search decoding. (default: %(default)d)") -parser.add_argument( - "--language_model_path", - default="lm/data/common_crawl_00.prune01111.trie.klm", - type=str, - help="Path for language model. (default: %(default)s)") -parser.add_argument( - "--alpha_from", - default=0.1, - type=float, - help="Where alpha starts from. (default: %(default)f)") -parser.add_argument( - "--num_alphas", - default=14, - type=int, - help="Number of candidate alphas. (default: %(default)d)") -parser.add_argument( - "--alpha_to", - default=0.36, - type=float, - help="Where alpha ends with. (default: %(default)f)") -parser.add_argument( - "--beta_from", - default=0.05, - type=float, - help="Where beta starts from. (default: %(default)f)") -parser.add_argument( - "--num_betas", - default=20, - type=float, - help="Number of candidate betas. (default: %(default)d)") -parser.add_argument( - "--beta_to", - default=1.0, - type=float, - help="Where beta ends with. (default: %(default)f)") -parser.add_argument( - "--cutoff_prob", - default=0.99, - type=float, - help="The cutoff probability of pruning" - "in beam search. (default: %(default)f)") -args = parser.parse_args() - - -def tune(): - """Tune parameters alpha and beta on one minibatch.""" - if not args.num_alphas >= 0: - raise ValueError("num_alphas must be non-negative!") - if not args.num_betas >= 0: - raise ValueError("num_betas must be non-negative!") - - data_generator = DataGenerator( - vocab_filepath=args.vocab_filepath, - mean_std_filepath=args.mean_std_filepath, - augmentation_config='{}', - specgram_type=args.specgram_type, - num_threads=args.num_threads_data) - batch_reader = data_generator.batch_reader_creator( - manifest_path=args.tune_manifest_path, - batch_size=args.num_samples, - sortagrad=False, - shuffle_method=None) - tune_data = batch_reader().next() - target_transcripts = [ - ''.join([data_generator.vocab_list[token] for token in transcript]) - for _, transcript in tune_data - ] - - ds2_model = DeepSpeech2Model( - vocab_size=data_generator.vocab_size, - num_conv_layers=args.num_conv_layers, - num_rnn_layers=args.num_rnn_layers, - rnn_layer_size=args.rnn_layer_size, - pretrained_model_path=args.model_filepath) - - # create grid for search - cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas) - cand_betas = np.linspace(args.beta_from, args.beta_to, args.num_betas) - params_grid = [(alpha, beta) for alpha in cand_alphas - for beta in cand_betas] - - ## tune parameters in loop - for alpha, beta in params_grid: - result_transcripts = ds2_model.infer_batch( - infer_data=tune_data, - decode_method='beam_search', - beam_alpha=alpha, - beam_beta=beta, - beam_size=args.beam_size, - cutoff_prob=args.cutoff_prob, - vocab_list=data_generator.vocab_list, - language_model_path=args.language_model_path, - num_processes=args.num_processes_beam_search) - wer_sum, num_ins = 0.0, 0 - for target, result in zip(target_transcripts, result_transcripts): - wer_sum += wer(target, result) - num_ins += 1 - print("alpha = %f\tbeta = %f\tWER = %f" % - (alpha, beta, wer_sum / num_ins)) - - -def main(): - utils.print_arguments(args) - paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) - tune() - - -if __name__ == '__main__': - main() diff --git a/utils.py b/utils.py deleted file mode 100644 index 9ca363c8..00000000 --- a/utils.py +++ /dev/null @@ -1,25 +0,0 @@ -"""Contains common utility functions.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - - -def print_arguments(args): - """Print argparse's arguments. - - Usage: - - .. code-block:: python - - parser = argparse.ArgumentParser() - parser.add_argument("name", default="Jonh", type=str, help="User name.") - args = parser.parse_args() - print_arguments(args) - - :param args: Input argparse.Namespace for printing. - :type args: argparse.Namespace - """ - print("----- Configuration Arguments -----") - for arg, value in vars(args).iteritems(): - print("%s: %s" % (arg, value)) - print("------------------------------------") diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/error_rate.py b/utils/error_rate.py similarity index 100% rename from error_rate.py rename to utils/error_rate.py diff --git a/tests/test_error_rate.py b/utils/tests/test_error_rate.py similarity index 99% rename from tests/test_error_rate.py rename to utils/tests/test_error_rate.py index 99e137a9..d6bc7442 100644 --- a/tests/test_error_rate.py +++ b/utils/tests/test_error_rate.py @@ -5,7 +5,7 @@ from __future__ import division from __future__ import print_function import unittest -import error_rate +from utils import error_rate class TestParse(unittest.TestCase): diff --git a/utils/utility.py b/utils/utility.py new file mode 100644 index 00000000..2e489ade --- /dev/null +++ b/utils/utility.py @@ -0,0 +1,47 @@ +"""Contains common utility functions.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import distutils.util + + +def print_arguments(args): + """Print argparse's arguments. + + Usage: + + .. code-block:: python + + parser = argparse.ArgumentParser() + parser.add_argument("name", default="Jonh", type=str, help="User name.") + args = parser.parse_args() + print_arguments(args) + + :param args: Input argparse.Namespace for printing. + :type args: argparse.Namespace + """ + print("----------- Configuration Arguments -----------") + for arg, value in sorted(vars(args).iteritems()): + print("%s: %s" % (arg, value)) + print("------------------------------------------------") + + +def add_arguments(argname, type, default, help, argparser, **kwargs): + """Add argparse's argument. + + Usage: + + .. code-block:: python + + parser = argparse.ArgumentParser() + add_argument("name", str, "Jonh", "User name.", parser) + args = parser.parse_args() + """ + type = distutils.util.strtobool if type == bool else type + argparser.add_argument( + "--" + argname, + default=default, + type=type, + help=help + ' Default: %(default)s.', + **kwargs)