add tune
fix gru model bugs
add dataset and model test
pull/522/head
Hui Zhang 5 years ago
parent 16dcdf34dc
commit c3a6d19af9

@@ -26,7 +26,7 @@ script:
   - exit_code=0
   - .travis/precommit.sh || exit_code=$(( exit_code | $? ))
   - docker run -i --rm -v "$PWD:/py_unittest" paddlepaddle/paddle:latest /bin/bash -c
-    'cd /py_unittest; sh .travis/unittest.sh' || exit_code=$(( exit_code | $? ))
+    'cd /py_unittest; source env.sh; bash .travis/unittest.sh' || exit_code=$(( exit_code | $? ))
   - exit $exit_code
 notifications:

@@ -0,0 +1,389 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "emerging-meter",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/layers/utils.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" def convert_to_list(value, n, name, dtype=np.int):\n",
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/scipy/fftpack/__init__.py:103: DeprecationWarning: The module numpy.dual is deprecated. Instead of using dual, use the functions directly from numpy or scipy.\n",
" from numpy.dual import register_func\n",
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/scipy/special/orthogonal.py:81: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" from numpy import (exp, inf, pi, sqrt, floor, sin, cos, around, int,\n",
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/numba/core/types/__init__.py:108: DeprecationWarning: `np.long` is a deprecated alias for `np.compat.long`. To silence this warning, use `np.compat.long` by itself. In the likely event your code does not need to work on Python 2 you can use the builtin `int` for which `np.compat.long` is itself an alias. Doing this will not modify any behaviour and is safe. When replacing `np.long`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" long_ = _make_signed(np.long)\n",
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/numba/core/types/__init__.py:109: DeprecationWarning: `np.long` is a deprecated alias for `np.compat.long`. To silence this warning, use `np.compat.long` by itself. In the likely event your code does not need to work on Python 2 you can use the builtin `int` for which `np.compat.long` is itself an alias. Doing this will not modify any behaviour and is safe. When replacing `np.long`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" ulong = _make_unsigned(np.long)\n"
]
}
],
"source": [
"import math\n",
"import random\n",
"import tarfile\n",
"import logging\n",
"import numpy as np\n",
"from collections import namedtuple\n",
"from functools import partial\n",
"\n",
"import paddle\n",
"from paddle.io import Dataset\n",
"from paddle.io import DataLoader\n",
"from paddle.io import BatchSampler\n",
"from paddle.io import DistributedBatchSampler\n",
"from paddle import distributed as dist\n",
"\n",
"from data_utils.utility import read_manifest\n",
"from data_utils.augmentor.augmentation import AugmentationPipeline\n",
"from data_utils.featurizer.speech_featurizer import SpeechFeaturizer\n",
"from data_utils.speech import SpeechSegment\n",
"from data_utils.normalizer import FeatureNormalizer\n",
"\n",
"\n",
"from data_utils.dataset import (\n",
" DeepSpeech2Dataset,\n",
" DeepSpeech2DistributedBatchSampler,\n",
" DeepSpeech2BatchSampler,\n",
" SpeechCollator,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "excessive-american",
"metadata": {},
"outputs": [],
"source": [
"def create_dataloader(manifest_path,\t\n",
" vocab_filepath,\t\n",
" mean_std_filepath,\t\n",
" augmentation_config='{}',\t\n",
" max_duration=float('inf'),\t\n",
" min_duration=0.0,\t\n",
" stride_ms=10.0,\t\n",
" window_ms=20.0,\t\n",
" max_freq=None,\t\n",
" specgram_type='linear',\t\n",
" use_dB_normalization=True,\t\n",
" random_seed=0,\t\n",
" keep_transcription_text=False,\t\n",
" is_training=False,\t\n",
" batch_size=1,\t\n",
" num_workers=0,\t\n",
" sortagrad=False,\t\n",
" shuffle_method=None,\t\n",
" dist=False):\t\n",
"\n",
" dataset = DeepSpeech2Dataset(\t\n",
" manifest_path,\t\n",
" vocab_filepath,\t\n",
" mean_std_filepath,\t\n",
" augmentation_config=augmentation_config,\t\n",
" max_duration=max_duration,\t\n",
" min_duration=min_duration,\t\n",
" stride_ms=stride_ms,\t\n",
" window_ms=window_ms,\t\n",
" max_freq=max_freq,\t\n",
" specgram_type=specgram_type,\t\n",
" use_dB_normalization=use_dB_normalization,\t\n",
" random_seed=random_seed,\t\n",
" keep_transcription_text=keep_transcription_text)\t\n",
"\n",
" if dist:\t\n",
" batch_sampler = DeepSpeech2DistributedBatchSampler(\t\n",
" dataset,\t\n",
" batch_size,\t\n",
" num_replicas=None,\t\n",
" rank=None,\t\n",
" shuffle=is_training,\t\n",
" drop_last=is_training,\t\n",
" sortagrad=is_training,\t\n",
" shuffle_method=shuffle_method)\t\n",
" else:\t\n",
" batch_sampler = DeepSpeech2BatchSampler(\t\n",
" dataset,\t\n",
" shuffle=is_training,\t\n",
" batch_size=batch_size,\t\n",
" drop_last=is_training,\t\n",
" sortagrad=is_training,\t\n",
" shuffle_method=shuffle_method)\t\n",
"\n",
" def padding_batch(batch, padding_to=-1, flatten=False, is_training=True):\t\n",
" \"\"\"\t\n",
" Padding audio features with zeros to make them have the same shape (or\t\n",
" a user-defined shape) within one bach.\t\n",
"\n",
" If ``padding_to`` is -1, the maximun shape in the batch will be used\t\n",
" as the target shape for padding. Otherwise, `padding_to` will be the\t\n",
" target shape (only refers to the second axis).\t\n",
"\n",
" If `flatten` is True, features will be flatten to 1darray.\t\n",
" \"\"\"\t\n",
" new_batch = []\t\n",
" # get target shape\t\n",
" max_length = max([audio.shape[1] for audio, text in batch])\t\n",
" if padding_to != -1:\t\n",
" if padding_to < max_length:\t\n",
" raise ValueError(\"If padding_to is not -1, it should be larger \"\t\n",
" \"than any instance's shape in the batch\")\t\n",
" max_length = padding_to\t\n",
" max_text_length = max([len(text) for audio, text in batch])\t\n",
" # padding\t\n",
" padded_audios = []\t\n",
" audio_lens = []\t\n",
" texts, text_lens = [], []\t\n",
" for audio, text in batch:\t\n",
" padded_audio = np.zeros([audio.shape[0], max_length])\t\n",
" padded_audio[:, :audio.shape[1]] = audio\t\n",
" if flatten:\t\n",
" padded_audio = padded_audio.flatten()\t\n",
" padded_audios.append(padded_audio)\t\n",
" audio_lens.append(audio.shape[1])\t\n",
"\n",
" padded_text = np.zeros([max_text_length])\n",
" if is_training:\n",
" padded_text[:len(text)] = text\t# ids\n",
" else:\n",
" padded_text[:len(text)] = [ord(t) for t in text] # string\n",
" \n",
" texts.append(padded_text)\t\n",
" text_lens.append(len(text))\t\n",
"\n",
" padded_audios = np.array(padded_audios).astype('float32')\t\n",
" audio_lens = np.array(audio_lens).astype('int64')\t\n",
" texts = np.array(texts).astype('int32')\t\n",
" text_lens = np.array(text_lens).astype('int64')\t\n",
" return padded_audios, texts, audio_lens, text_lens\t\n",
"\n",
" loader = DataLoader(\t\n",
" dataset,\t\n",
" batch_sampler=batch_sampler,\t\n",
" collate_fn=partial(padding_batch, is_training=is_training),\t\n",
" num_workers=num_workers)\t\n",
" return loader"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "naval-brave",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'num_samples': 5, 'beam_size': 500, 'num_proc_bsearch': 8, 'num_conv_layers': 2, 'num_rnn_layers': 3, 'rnn_layer_size': 2048, 'alpha': 2.5, 'beta': 0.3, 'cutoff_prob': 1.0, 'cutoff_top_n': 40, 'use_gru': False, 'use_gpu': True, 'share_rnn_weights': True, 'infer_manifest': 'examples/aishell/data/manifest.dev', 'mean_std_path': 'examples/aishell/data/mean_std.npz', 'vocab_path': 'examples/aishell/data/vocab.txt', 'lang_model_path': 'models/lm/common_crawl_00.prune01111.trie.klm', 'model_path': 'examples/aishell/checkpoints/step_final', 'decoding_method': 'ctc_beam_search', 'error_rate_type': 'wer', 'specgram_type': 'linear'}\n"
]
}
],
"source": [
"import sys\n",
"import argparse\n",
"import functools\n",
"from utils.utility import add_arguments, print_arguments\n",
"parser = argparse.ArgumentParser(description=__doc__)\n",
"add_arg = functools.partial(add_arguments, argparser=parser)\n",
"# yapf: disable\n",
"add_arg('num_samples', int, 5, \"# of samples to infer.\")\n",
"add_arg('beam_size', int, 500, \"Beam search width.\")\n",
"add_arg('num_proc_bsearch', int, 8, \"# of CPUs for beam search.\")\n",
"add_arg('num_conv_layers', int, 2, \"# of convolution layers.\")\n",
"add_arg('num_rnn_layers', int, 3, \"# of recurrent layers.\")\n",
"add_arg('rnn_layer_size', int, 2048, \"# of recurrent cells per layer.\")\n",
"add_arg('alpha', float, 2.5, \"Coef of LM for beam search.\")\n",
"add_arg('beta', float, 0.3, \"Coef of WC for beam search.\")\n",
"add_arg('cutoff_prob', float, 1.0, \"Cutoff probability for pruning.\")\n",
"add_arg('cutoff_top_n', int, 40, \"Cutoff number for pruning.\")\n",
"add_arg('use_gru', bool, False, \"Use GRUs instead of simple RNNs.\")\n",
"add_arg('use_gpu', bool, True, \"Use GPU or not.\")\n",
"add_arg('share_rnn_weights',bool, True, \"Share input-hidden weights across \"\n",
" \"bi-directional RNNs. Not for GRU.\")\n",
"add_arg('infer_manifest', str,\n",
" 'examples/aishell/data/manifest.dev',\n",
" \"Filepath of manifest to infer.\")\n",
"add_arg('mean_std_path', str,\n",
" 'examples/aishell/data/mean_std.npz',\n",
" \"Filepath of normalizer's mean & std.\")\n",
"add_arg('vocab_path', str,\n",
" 'examples/aishell/data/vocab.txt',\n",
" \"Filepath of vocabulary.\")\n",
"add_arg('lang_model_path', str,\n",
" 'models/lm/common_crawl_00.prune01111.trie.klm',\n",
" \"Filepath for language model.\")\n",
"add_arg('model_path', str,\n",
" 'examples/aishell/checkpoints/step_final',\n",
" \"If None, the training starts from scratch, \"\n",
" \"otherwise, it resumes from the pre-trained model.\")\n",
"add_arg('decoding_method', str,\n",
" 'ctc_beam_search',\n",
" \"Decoding method. Options: ctc_beam_search, ctc_greedy\",\n",
" choices = ['ctc_beam_search', 'ctc_greedy'])\n",
"add_arg('error_rate_type', str,\n",
" 'wer',\n",
" \"Error rate type for evaluation.\",\n",
" choices=['wer', 'cer'])\n",
"add_arg('specgram_type', str,\n",
" 'linear',\n",
" \"Audio feature type. Options: linear, mfcc.\",\n",
" choices=['linear', 'mfcc'])\n",
"# yapf: disable\n",
"args = parser.parse_args([])\n",
"print(vars(args))"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "bearing-physics",
"metadata": {},
"outputs": [],
"source": [
"batch_reader = create_dataloader(\n",
" manifest_path=args.infer_manifest,\n",
" vocab_filepath=args.vocab_path,\n",
" mean_std_filepath=args.mean_std_path,\n",
" augmentation_config='{}',\n",
" #max_duration=float('inf'),\n",
" max_duration=27.0,\n",
" min_duration=0.0,\n",
" stride_ms=10.0,\n",
" window_ms=20.0,\n",
" max_freq=None,\n",
" specgram_type=args.specgram_type,\n",
" use_dB_normalization=True,\n",
" random_seed=0,\n",
" keep_transcription_text=True,\n",
" is_training=False,\n",
" batch_size=args.num_samples,\n",
" sortagrad=True,\n",
" shuffle_method=None,\n",
" dist=False)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "classified-melissa",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"test Tensor(shape=[5, 6], dtype=int32, place=CUDAPinnedPlace, stop_gradient=True,\n",
" [[22823, 26102, 20195, 37324, 0 , 0 ],\n",
" [22238, 26469, 23601, 22909, 0 , 0 ],\n",
" [20108, 26376, 22235, 26085, 0 , 0 ],\n",
" [36824, 35201, 20445, 25345, 32654, 24863],\n",
" [29042, 27748, 21463, 23456, 0 , 0 ]])\n",
"test raw 大时代里\n",
"test raw 煲汤受宠\n",
"audio len Tensor(shape=[5], dtype=int64, place=CUDAPinnedPlace, stop_gradient=True,\n",
" [163, 167, 180, 186, 186])\n",
"test len Tensor(shape=[5], dtype=int64, place=CUDAPlace(0), stop_gradient=True,\n",
" [4, 4, 4, 6, 4])\n",
"audio Tensor(shape=[5, 161, 186], dtype=float32, place=CUDAPinnedPlace, stop_gradient=True,\n",
" [[[ 1.11669052, 0.79015088, 0.93658292, ..., 0. , 0. , 0. ],\n",
" [ 0.83549136, 0.72643483, 0.83578080, ..., 0. , 0. , 0. ],\n",
" [-0.89155018, -0.18894747, -0.53357804, ..., 0. , 0. , 0. ],\n",
" ...,\n",
" [ 0.33386710, -0.81240511, 0.12869737, ..., 0. , 0. , 0. ],\n",
" [-0.17537928, 0.58380985, 0.70696265, ..., 0. , 0. , 0. ],\n",
" [-0.84175998, 1.22041416, 0.07929770, ..., 0. , 0. , 0. ]],\n",
"\n",
" [[-0.35964420, 0.77392709, 0.71409988, ..., 0. , 0. , 0. ],\n",
" [-0.15990183, 0.42962283, 0.06222462, ..., 0. , 0. , 0. ],\n",
" [-0.31166190, -0.74864638, -0.52836996, ..., 0. , 0. , 0. ],\n",
" ...,\n",
" [-0.27546275, 0.32889456, 0.12410031, ..., 0. , 0. , 0. ],\n",
" [ 0.16264282, 0.49418071, -0.15960945, ..., 0. , 0. , 0. ],\n",
" [ 0.12476666, 0.00516864, 1.16021466, ..., 0. , 0. , 0. ]],\n",
"\n",
" [[ 0.90202141, 1.48541915, 0.92062062, ..., 0. , 0. , 0. ],\n",
" [ 0.82661545, 1.37171340, 0.86746097, ..., 0. , 0. , 0. ],\n",
" [-0.62287915, -0.48645937, 0.35041964, ..., 0. , 0. , 0. ],\n",
" ...,\n",
" [ 0.07376949, 0.07138316, 0.76355994, ..., 0. , 0. , 0. ],\n",
" [-0.32306790, 0.43247896, 1.27311838, ..., 0. , 0. , 0. ],\n",
" [-0.97667056, 0.60747612, 0.79181534, ..., 0. , 0. , 0. ]],\n",
"\n",
" [[ 0.72022128, 0.95428467, 0.92766261, ..., 0.29105374, -0.45564806, -0.62151009],\n",
" [ 0.42083180, 0.49279949, 0.82724041, ..., -0.17333922, -1.45363355, -0.61673522],\n",
" [-0.76116520, -0.84750438, -0.09512503, ..., -1.01497340, -1.42781055, -0.80859023],\n",
" ...,\n",
" [-0.23009977, 1.06155431, 1.09065628, ..., 0.25581080, 0.53794998, -1.22650719],\n",
" [-1.37693381, 0.30778193, 0.17152318, ..., 0.51650339, 0.25580606, 0.83097816],\n",
" [-1.62180591, 1.30567718, 1.09928656, ..., -0.77590007, 1.27712476, 0.53189957]],\n",
"\n",
" [[ 1.03205252, -0.51535392, 0.21077573, ..., 0.76618457, 1.27425683, 1.52250278],\n",
" [ 0.82059991, 0.43990925, 0.13090958, ..., 0.86662549, 1.01687658, 1.48495352],\n",
" [-0.75489789, -0.01997089, -0.65174174, ..., 0.09061214, -0.55211234, -0.01614586],\n",
" ...,\n",
" [ 0.50985396, 1.84555030, 0.79185146, ..., 1.13666189, 1.19898069, 1.98158395],\n",
" [ 1.98721015, 2.52385354, 1.11714780, ..., 0.19416514, 1.11329341, 0.64460152],\n",
" [ 2.69512844, 1.90993905, 0.50245082, ..., -0.50902629, 0.03333465, -1.24584770]]])\n"
]
}
],
"source": [
"for idx, (audio, text, audio_len, text_len) in enumerate(batch_reader()):\n",
" print('test', text)\n",
" print(\"test raw\", ''.join( chr(i) for i in text[0][:int(text_len[0])] ))\n",
" print(\"test raw\", ''.join( chr(i) for i in text[-1][:int(text_len[-1])] ))\n",
" print('audio len', audio_len)\n",
" print('test len', text_len)\n",
" print('audio', audio)\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "unexpected-skating",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "minus-modern",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
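
Note on the new notebook: padding_batch pads every audio feature to the longest time length in the batch and, because the loader here is built with keep_transcription_text=True, stores transcripts as ord() code points so they batch as integer tensors. A minimal NumPy-only sketch of that collation idea (all values are toy except the 161 feature bins and the sample transcripts, which match the notebook output above):

import numpy as np

# Toy batch of (audio, text) pairs with different time lengths.
# audio: [feature_bins, time]; text: raw transcript string.
batch = [
    (np.ones([161, 100], dtype='float32'), "大时代里"),
    (np.ones([161, 163], dtype='float32'), "煲汤受宠"),
]

max_length = max(audio.shape[1] for audio, _ in batch)     # 163
max_text_length = max(len(text) for _, text in batch)      # 4

padded_audios, audio_lens, texts, text_lens = [], [], [], []
for audio, text in batch:
    padded = np.zeros([audio.shape[0], max_length], dtype='float32')
    padded[:, :audio.shape[1]] = audio                     # zero-pad the time axis
    padded_audios.append(padded)
    audio_lens.append(audio.shape[1])
    padded_text = np.zeros([max_text_length], dtype='int32')
    padded_text[:len(text)] = [ord(t) for t in text]       # keep_transcription_text path
    texts.append(padded_text)
    text_lens.append(len(text))

print(np.stack(padded_audios).shape)                       # (2, 161, 163)
print(''.join(chr(i) for i in texts[0][:text_lens[0]]))    # 大时代里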

@@ -39,13 +39,13 @@ training:
   save_interval: 1000
   valid_interval: 1000
 decoding:
-  batch_size: 128
+  batch_size: 10
   error_rate_type: cer
   decoding_method: ctc_beam_search
   lang_model_path: models/lm/zh_giga.no_cna_cmn.prune01244.klm
   alpha: 2.6
   beta: 5.0
   beam_size: 300
-  cutoff_prob: 0.99
+  cutoff_prob: 1.0
   cutoff_top_n: 40
-  num_proc_bsearch: 8
+  num_proc_bsearch: 10
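
For context on the cutoff_prob change: setting it to 1.0 effectively disables probability-mass pruning and leaves only the cutoff_top_n cap, since the beam-search decoder keeps the most probable characters at each step until their cumulative probability reaches cutoff_prob. A hedged sketch of that pruning rule (an illustration, not the actual C++ decoder):

import numpy as np

def prune_step(step_probs, cutoff_prob=1.0, cutoff_top_n=40):
    """Return the vocab indices kept for one decoding step.

    Keeps the most probable characters until their cumulative
    probability reaches `cutoff_prob`, never more than `cutoff_top_n`.
    """
    order = np.argsort(step_probs)[::-1]
    cum = np.cumsum(step_probs[order])
    # +1 so the index that crosses the threshold is included
    k = int(np.searchsorted(cum, cutoff_prob) + 1)
    return order[:min(k, cutoff_top_n)]

probs = np.array([0.5, 0.3, 0.15, 0.05])
print(prune_step(probs, cutoff_prob=0.9))   # [0 1 2] -- 0.95 >= 0.9
print(prune_step(probs, cutoff_prob=1.0))   # all four kept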

@@ -13,7 +13,7 @@ python3 -u ${MAIN_ROOT}/infer.py \
 --device 'gpu' \
 --nproc 1 \
 --config conf/deepspeech2.yaml \
---checkpoint_path ckpt/checkpoints/step-3283
+--checkpoint_path ${1}
 if [ $? -ne 0 ]; then

@@ -5,16 +5,16 @@ python3 -u ${MAIN_ROOT}/tune.py \
 --device 'gpu' \
 --nproc 1 \
 --config conf/deepspeech2.yaml \
---num_batches=-1 \
+--num_batches=10 \
 --batch_size=128 \
---beam_size=500 \
---num_proc_bsearch=12 \
---num_alphas=45 \
---num_betas=8 \
---alpha_from=1.0 \
---alpha_to=3.2 \
---beta_from=0.1 \
---beta_to=0.45 \
+--beam_size=300 \
+--num_proc_bsearch=8 \
+--num_alphas=10 \
+--num_betas=10 \
+--alpha_from=0.0 \
+--alpha_to=5.0 \
+--beta_from=-6 \
+--beta_to=6 \
 --cutoff_prob=1.0 \
 --cutoff_top_n=40 \
 --checkpoint_path ${1}
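
The retuned script sweeps a coarser but wider 10x10 grid instead of the old 45x8 one. tune.py builds the grid as the Cartesian product of two np.linspace ranges; with the new arguments that looks like:

import numpy as np

# Values from the updated tune.sh arguments above
cand_alphas = np.linspace(0.0, 5.0, 10)   # --alpha_from / --alpha_to / --num_alphas
cand_betas = np.linspace(-6, 6, 10)       # --beta_from / --beta_to / --num_betas
params_grid = [(alpha, beta) for alpha in cand_alphas for beta in cand_betas]

print(len(params_grid))   # 100 (alpha, beta) pairs, vs. 45 * 8 = 360 before
print(params_grid[0])     # first candidate: alpha=0.0, beta=-6.0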

@@ -26,7 +26,7 @@ model:
   num_conv_layers: 2
   num_rnn_layers: 3
   rnn_layer_size: 2048
-  use_gru: True
+  use_gru: False
   share_rnn_weights: True
 training:
   n_epoch: 20

@@ -46,7 +46,7 @@ _C.model = CN(
         num_conv_layers=2,  #Number of stacking convolution layers.
         num_rnn_layers=3,  #Number of stacking RNN layers.
         rnn_layer_size=1024,  #RNN layer size (number of RNN cells).
-        use_gru=False,  #Use gru if set True. Use simple rnn if set False.
+        use_gru=True,  #Use gru if set True. Use simple rnn if set False.
         share_rnn_weights=True  #Whether to share input-hidden weights between forward and backward directional RNNs. Notice that for GRU, weight sharing is not supported.
     ))

@@ -242,6 +242,7 @@ class DeepSpeech2Trainer(Trainer):
             num_conv_layers=config.model.num_conv_layers,
             num_rnn_layers=config.model.num_rnn_layers,
             rnn_size=config.model.rnn_layer_size,
+            use_gru=config.model.use_gru,
             share_rnn_weights=config.model.share_rnn_weights)
         if self.parallel:
@@ -329,7 +330,7 @@ class DeepSpeech2Trainer(Trainer):
             sortagrad=config.data.sortagrad,
             shuffle_method=config.data.shuffle_method)
-        collate_fn = SpeechCollator()
+        collate_fn = SpeechCollator(is_training=True)
         self.train_loader = DataLoader(
             train_dataset,
             batch_sampler=batch_sampler,
@@ -449,7 +450,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
             output_dir.mkdir(parents=True, exist_ok=True)
         else:
             output_dir = Path(
-                self.args.checkpoint_path).expanduser().parent / "infer"
+                self.args.checkpoint_path).expanduser().parent.parent / "infer"
             output_dir.mkdir(parents=True, exist_ok=True)
         self.output_dir = output_dir
@@ -485,6 +486,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
             num_conv_layers=config.model.num_conv_layers,
             num_rnn_layers=config.model.num_rnn_layers,
             rnn_size=config.model.rnn_layer_size,
+            use_gru=config.model.use_gru,
             share_rnn_weights=config.model.share_rnn_weights)
         if self.parallel:
@@ -498,6 +500,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
     def setup_dataloader(self):
         config = self.config
+        # return raw text
        test_dataset = DeepSpeech2Dataset(
             config.data.test_manifest,
             config.data.vocab_filepath,
@@ -516,6 +519,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
             random_seed=config.data.random_seed,
             keep_transcription_text=True)
+        # return text ord id
         self.test_loader = DataLoader(
             test_dataset,
             batch_size=config.decoding.batch_size,
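
The two "# return ..." comments mark the convention this PR relies on: with keep_transcription_text=True the dataset yields raw strings, the collator encodes them as ord() code points so they batch as integer tensors, and the tester decodes them back with chr(). A round-trip illustration (values taken from the notebook output above):

text = "大时代里"
ids = [ord(c) for c in text]               # collator: string -> code points
print(ids)                                 # [22823, 26102, 20195, 37324]
restored = ''.join(chr(i) for i in ids)    # tester: code points -> string
assert restored == text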

@@ -22,6 +22,8 @@ from paddle import nn
 from paddle.nn import functional as F
 from paddle.nn import initializer as I
+from utils import checkpoint
 from decoders.swig_wrapper import Scorer
 from decoders.swig_wrapper import ctc_greedy_decoder
 from decoders.swig_wrapper import ctc_beam_search_decoder_batch
@@ -89,7 +91,7 @@ class ConvBn(nn.Layer):
             stride=stride,
             padding=padding,
             weight_attr=None,
-            bias_attr=None,
+            bias_attr=False,
             data_format='NCHW')
         self.bn = nn.BatchNorm2D(
@@ -387,6 +389,7 @@ class BiGRUWithBN(nn.Layer):
     def __init__(self, i_size, h_size, act):
         super().__init__()
         hidden_size = h_size * 3
         self.fw_fc = nn.Linear(i_size, hidden_size, bias_attr=False)
         self.fw_bn = nn.BatchNorm1D(
             hidden_size, bias_attr=None, data_format='NLC')
@@ -494,7 +497,7 @@ class DeepSpeech2(nn.Layer):
                  dict_size,
                  num_conv_layers=2,
                  num_rnn_layers=3,
-                 rnn_size=256,
+                 rnn_size=1024,
                  use_gru=False,
                  share_rnn_weights=True):
         super().__init__()
@@ -684,9 +687,10 @@ class DeepSpeech2(nn.Layer):
                lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob,
                cutoff_top_n, num_processes):
         _, probs, _ = self.predict(audio, audio_len)
-        return self.decode_probs(
-            probs, vocab_list, decoding_method, lang_model_path, beam_alpha,
-            beam_beta, beam_size, cutoff_prob, cutoff_top_n, num_processes)
+        return self.decode_probs(probs.numpy(), vocab_list, decoding_method,
+                                 lang_model_path, beam_alpha, beam_beta,
+                                 beam_size, cutoff_prob, cutoff_top_n,
+                                 num_processes)
     def from_pretrained(self, checkpoint_path):
         """Build a model from a pretrained model.
@@ -704,7 +708,7 @@ class DeepSpeech2(nn.Layer):
             The model built from the pretrained result.
         """
         checkpoint.load_parameters(self, checkpoint_path=checkpoint_path)
-        return model
+        return
 def ctc_loss(logits,

@@ -63,7 +63,7 @@ if __name__ == '__main__':
         rnn_size=1024,
         use_gru=True,
         share_rnn_weights=False, )
-    probs = model2(audio, text, audio_len, text_len)
+    logits, probs, logits_len = model2(audio, text, audio_len, text_len)
     print('probs.shape', probs.shape)
     print("-----------------")
@@ -75,7 +75,7 @@ if __name__ == '__main__':
         rnn_size=1024,
         use_gru=False,
         share_rnn_weights=True, )
-    probs = model3(audio, text, audio_len, text_len)
+    logits, probs, logits_len = model3(audio, text, audio_len, text_len)
     print('probs.shape', probs.shape)
     print("-----------------")
@@ -87,7 +87,7 @@ if __name__ == '__main__':
         rnn_size=1024,
         use_gru=True,
         share_rnn_weights=True, )
-    probs = model4(audio, text, audio_len, text_len)
+    logits, probs, logits_len = model4(audio, text, audio_len, text_len)
     print('probs.shape', probs.shape)
     print("-----------------")
@@ -99,6 +99,6 @@ if __name__ == '__main__':
         rnn_size=1024,
         use_gru=False,
         share_rnn_weights=False, )
-    probs = model5(audio, text, audio_len, text_len)
+    logits, probs, logits_len = model5(audio, text, audio_len, text_len)
     print('probs.shape', probs.shape)
     print("-----------------")

@@ -267,30 +267,29 @@ class Trainer():
         when = 'D'
         backup = 7
         format = '[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
-        formatter = logging.Formatter(fmt=format, datefmt='%Y/%m/%d %H:%M:%S')
         logger = logging.getLogger(__name__)
         logger.setLevel("INFO")
+        formatter = logging.Formatter(fmt=format, datefmt='%Y/%m/%d %H:%M:%S')
         stream_handler = logging.StreamHandler()
         stream_handler.setFormatter(formatter)
         logger.addHandler(stream_handler)
-        if not hasattr(self, 'output_dir'):
-            self.logger = logger
-            return
+        # if not hasattr(self, 'output_dir'):
+        #     self.logger = logger
+        #     return
         log_file = self.output_dir / 'worker_{}.log'.format(dist.get_rank())
         # file_handler = logging.FileHandler(str(log_file))
         # file_handler.setFormatter(formatter)
         # logger.addHandler(file_handler)
-        handler = logging.handlers.TimedRotatingFileHandler(
-            str(self.output_dir / "warning.log"), when=when, backupCount=backup)
-        handler.setLevel(logging.WARNING)
-        handler.setFormatter(formatter)
-        logger.addHandler(handler)
+        # handler = logging.handlers.TimedRotatingFileHandler(
+        #     str(self.output_dir / "warning.log"), when=when, backupCount=backup)
+        # handler.setLevel(logging.WARNING)
+        # handler.setFormatter(formatter)
+        # logger.addHandler(handler)
         # global logger
         stdout = False

@@ -21,107 +21,69 @@ import functools
 import gzip
 import logging
 import paddle.fluid as fluid
-import _init_paths
-from data_utils.data import DataGenerator
-from model_utils.model import DeepSpeech2Model
-from utils.error_rate import char_errors, word_errors
-from utils.utility import add_arguments, print_arguments
-parser = argparse.ArgumentParser(description=__doc__)
-add_arg = functools.partial(add_arguments, argparser=parser)
-# yapf: disable
-add_arg('num_batches', int, -1, "# of batches tuning on. "
-        "Default -1, on whole dev set.")
-add_arg('batch_size', int, 256, "# of samples per batch.")
-add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).")
-add_arg('beam_size', int, 500, "Beam search width.")
-add_arg('num_proc_bsearch', int, 8, "# of CPUs for beam search.")
-add_arg('num_alphas', int, 45, "# of alpha candidates for tuning.")
-add_arg('num_betas', int, 8, "# of beta candidates for tuning.")
-add_arg('alpha_from', float, 1.0, "Where alpha starts tuning from.")
-add_arg('alpha_to', float, 3.2, "Where alpha ends tuning with.")
-add_arg('beta_from', float, 0.1, "Where beta starts tuning from.")
-add_arg('beta_to', float, 0.45, "Where beta ends tuning with.")
-add_arg('cutoff_prob', float, 1.0, "Cutoff probability for pruning.")
-add_arg('cutoff_top_n', int, 40, "Cutoff number for pruning.")
-add_arg('num_conv_layers', int, 2, "# of convolution layers.")
-add_arg('num_rnn_layers', int, 3, "# of recurrent layers.")
-add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.")
-add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.")
-add_arg('use_gpu', bool, True, "Use GPU or not.")
-add_arg('share_rnn_weights', bool, True, "Share input-hidden weights across "
-        "bi-directional RNNs. Not for GRU.")
-add_arg('tune_manifest', str,
-        'data/librispeech/manifest.dev-clean',
-        "Filepath of manifest to tune.")
-add_arg('mean_std_path', str,
-        'data/librispeech/mean_std.npz',
-        "Filepath of normalizer's mean & std.")
-add_arg('vocab_path', str,
-        'data/librispeech/vocab.txt',
-        "Filepath of vocabulary.")
-add_arg('lang_model_path', str,
-        'models/lm/common_crawl_00.prune01111.trie.klm',
-        "Filepath for language model.")
-add_arg('model_path', str,
-        './checkpoints/libri/params.latest.tar.gz',
-        "If None, the training starts from scratch, "
-        "otherwise, it resumes from the pre-trained model.")
-add_arg('error_rate_type', str,
-        'wer',
-        "Error rate type for evaluation.",
-        choices=['wer', 'cer'])
-add_arg('specgram_type', str,
-        'linear',
-        "Audio feature type. Options: linear, mfcc.",
-        choices=['linear', 'mfcc'])
-# yapf: disable
-args = parser.parse_args()
+from training.cli import default_argument_parser
+from model_utils.config import get_cfg_defaults
+
+from data_utils.dataset import SpeechCollator
+from data_utils.dataset import DeepSpeech2Dataset
+from data_utils.dataset import DeepSpeech2DistributedBatchSampler
+from data_utils.dataset import DeepSpeech2BatchSampler
+from paddle.io import DataLoader
+
+from model_utils.network import DeepSpeech2
+from model_utils.network import DeepSpeech2Loss
+
+from utils.error_rate import char_errors, word_errors
+from utils.utility import add_arguments, print_arguments
 
-def tune():
+def tune(config, args):
     """Tune parameters alpha and beta incrementally."""
     if not args.num_alphas >= 0:
         raise ValueError("num_alphas must be non-negative!")
     if not args.num_betas >= 0:
         raise ValueError("num_betas must be non-negative!")
-    if args.use_gpu:
-        place = fluid.CUDAPlace(0)
-    else:
-        place = fluid.CPUPlace()
-    data_generator = DataGenerator(
-        vocab_filepath=args.vocab_path,
-        mean_std_filepath=args.mean_std_path,
-        augmentation_config='{}',
-        specgram_type=args.specgram_type,
-        keep_transcription_text=True,
-        place = place,
-        is_training = False)
-    batch_reader = data_generator.batch_reader_creator(
-        manifest_path=args.tune_manifest,
-        batch_size=args.batch_size,
-        sortagrad=False,
-        shuffle_method=None)
-    ds2_model = DeepSpeech2Model(
-        vocab_size=data_generator.vocab_size,
-        num_conv_layers=args.num_conv_layers,
-        num_rnn_layers=args.num_rnn_layers,
-        rnn_layer_size=args.rnn_layer_size,
-        use_gru=args.use_gru,
-        place=place,
-        init_from_pretrained_model=args.model_path,
-        share_rnn_weights=args.share_rnn_weights)
+    dev_dataset = DeepSpeech2Dataset(
+        config.data.dev_manifest,
+        config.data.vocab_filepath,
+        config.data.mean_std_filepath,
+        augmentation_config="{}",
+        max_duration=config.data.max_duration,
+        min_duration=config.data.min_duration,
+        stride_ms=config.data.stride_ms,
+        window_ms=config.data.window_ms,
+        n_fft=config.data.n_fft,
+        max_freq=config.data.max_freq,
+        target_sample_rate=config.data.target_sample_rate,
+        specgram_type=config.data.specgram_type,
+        use_dB_normalization=config.data.use_dB_normalization,
+        target_dB=config.data.target_dB,
+        random_seed=config.data.random_seed,
+        keep_transcription_text=True)
+
+    valid_loader = DataLoader(
+        dev_dataset,
+        batch_size=config.data.batch_size,
+        shuffle=False,
+        drop_last=False,
+        collate_fn=SpeechCollator(is_training=False))
+
+    model = DeepSpeech2(
+        feat_size=valid_loader.dataset.feature_size,
+        dict_size=valid_loader.dataset.vocab_size,
+        num_conv_layers=config.model.num_conv_layers,
+        num_rnn_layers=config.model.num_rnn_layers,
+        rnn_size=config.model.rnn_layer_size,
+        share_rnn_weights=config.model.share_rnn_weights)
+    model.from_pretrained(args.checkpoint_path)
+    model.eval()
 
     # decoders only accept string encoded in utf-8
-    vocab_list = [chars for chars in data_generator.vocab_list]
-    errors_func = char_errors if args.error_rate_type == 'cer' else word_errors
+    vocab_list = valid_loader.dataset.vocab_list
+    errors_func = char_errors if config.decoding.error_rate_type == 'cer' else word_errors
     # create grid for search
     cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas)
     cand_betas = np.linspace(args.beta_from, args.beta_to, args.num_betas)
@@ -131,34 +93,42 @@ def tune():
     err_sum = [0.0 for i in range(len(params_grid))]
     err_ave = [0.0 for i in range(len(params_grid))]
     num_ins, len_refs, cur_batch = 0, 0, 0
     # initialize external scorer
-    ds2_model.init_ext_scorer(args.alpha_from, args.beta_from,
-                              args.lang_model_path, vocab_list)
+    model.init_decode(args.alpha_from, args.beta_from,
+                      config.decoding.lang_model_path, vocab_list,
+                      config.decoding.decoding_method)
     ## incremental tuning parameters over multiple batches
-    ds2_model.logger.info("start tuning ...")
-    for infer_data in batch_reader():
+    print("start tuning ...")
+    for infer_data in valid_loader():
         if (args.num_batches >= 0) and (cur_batch >= args.num_batches):
             break
-        probs_split = ds2_model.infer_batch_probs(
-            infer_data=infer_data,
-            feeding_dict=data_generator.feeding)
-        target_transcripts = infer_data[1]
-        num_ins += len(target_transcripts)
+
+        def ordid2token(texts, texts_len):
+            """Convert ord() ids back to text via chr()."""
+            trans = []
+            for text, n in zip(texts, texts_len):
+                n = n.numpy().item()
+                ids = text[:n]
+                trans.append(''.join([chr(i) for i in ids]))
+            return trans
+
+        audio, text, audio_len, text_len = infer_data
+        _, probs, _ = model.predict(audio, audio_len)
+        target_transcripts = ordid2token(text, text_len)
+        num_ins += audio.shape[0]
         # grid search
         for index, (alpha, beta) in enumerate(params_grid):
-            result_transcripts = ds2_model.decode_batch_beam_search(
-                probs_split=probs_split,
-                beam_alpha=alpha,
-                beam_beta=beta,
-                beam_size=args.beam_size,
-                cutoff_prob=args.cutoff_prob,
-                cutoff_top_n=args.cutoff_top_n,
-                vocab_list=vocab_list,
-                num_processes=args.num_proc_bsearch)
+            print(f"tuning: alpha={alpha} beta={beta}")
+            result_transcripts = model.decode_probs(
+                probs.numpy(), vocab_list, config.decoding.decoding_method,
+                config.decoding.lang_model_path, alpha, beta,
+                config.decoding.beam_size, config.decoding.cutoff_prob,
+                config.decoding.cutoff_top_n, config.decoding.num_proc_bsearch)
             for target, result in zip(target_transcripts, result_transcripts):
+                #print(f"tuning: {target} {result}")
                 errors, len_ref = errors_func(target, result)
                 err_sum[index] += errors
@@ -171,37 +141,80 @@ def tune():
             if index % 2 == 0:
                 sys.stdout.write('.')
                 sys.stdout.flush()
+        print("tuning: one grid done!")
         # output on-line tuning result at the end of current batch
         err_ave_min = min(err_ave)
         min_index = err_ave.index(err_ave_min)
         print("\nBatch %d [%d/?], current opt (alpha, beta) = (%s, %s), "
-              " min [%s] = %f" %(cur_batch, num_ins,
-              "%.3f" % params_grid[min_index][0],
-              "%.3f" % params_grid[min_index][1],
-              args.error_rate_type, err_ave_min))
+              " min [%s] = %f" %
+              (cur_batch, num_ins, "%.3f" % params_grid[min_index][0], "%.3f" %
+               params_grid[min_index][1], args.error_rate_type, err_ave_min))
         cur_batch += 1
     # output WER/CER at every (alpha, beta)
-    print("\nFinal %s:\n" % args.error_rate_type)
+    print("\nFinal %s:\n" % config.decoding.error_rate_type)
     for index in range(len(params_grid)):
-        print("(alpha, beta) = (%s, %s), [%s] = %f"
-              % ("%.3f" % params_grid[index][0], "%.3f" % params_grid[index][1],
-                 args.error_rate_type, err_ave[index]))
+        print("(alpha, beta) = (%s, %s), [%s] = %f" %
+              ("%.3f" % params_grid[index][0], "%.3f" % params_grid[index][1],
+               config.decoding.error_rate_type, err_ave[index]))
     err_ave_min = min(err_ave)
     min_index = err_ave.index(err_ave_min)
-    print("\nFinish tuning on %d batches, final opt (alpha, beta) = (%s, %s)"
-          % (cur_batch, "%.3f" % params_grid[min_index][0],
-             "%.3f" % params_grid[min_index][1]))
+    print("\nFinish tuning on %d batches, final opt (alpha, beta) = (%s, %s)" %
+          (cur_batch, "%.3f" % params_grid[min_index][0],
+           "%.3f" % params_grid[min_index][1]))
     ds2_model.logger.info("finish tuning")
 
-def main():
+def main_sp(config, args):
+    tune(config, args)
+
+
+def main(config, args):
+    main_sp(config, args)
+
+
+if __name__ == "__main__":
+    parser = default_argument_parser()
+    add_arg = functools.partial(add_arguments, argparser=parser)
+    add_arg('num_batches', int, -1, "# of batches tuning on. "
+            "Default -1, on whole dev set.")
+    add_arg('num_alphas', int, 45, "# of alpha candidates for tuning.")
+    add_arg('num_betas', int, 8, "# of beta candidates for tuning.")
+    add_arg('alpha_from', float, 1.0, "Where alpha starts tuning from.")
+    add_arg('alpha_to', float, 3.2, "Where alpha ends tuning with.")
+    add_arg('beta_from', float, 0.1, "Where beta starts tuning from.")
+    add_arg('beta_to', float, 0.45, "Where beta ends tuning with.")
+    add_arg('batch_size', int, 256, "# of samples per batch.")
+    add_arg('beam_size', int, 500, "Beam search width.")
+    add_arg('num_proc_bsearch', int, 8, "# of CPUs for beam search.")
+    add_arg('cutoff_prob', float, 1.0, "Cutoff probability for pruning.")
+    add_arg('cutoff_top_n', int, 40, "Cutoff number for pruning.")
+    args = parser.parse_args()
     print_arguments(args)
-    tune()
 
+    # https://yaml.org/type/float.html
+    config = get_cfg_defaults()
+    if args.config:
+        config.merge_from_file(args.config)
+    if args.opts:
+        config.merge_from_list(args.opts)
+    config.data.batch_size = args.batch_size
+    config.decoding.beam_size = args.beam_size
+    config.decoding.num_proc_bsearch = args.num_proc_bsearch
+    config.decoding.cutoff_prob = args.cutoff_prob
+    config.decoding.cutoff_top_n = args.cutoff_top_n
+    config.freeze()
+    print(config)
+    if args.dump_config:
+        with open(args.dump_config, 'w') as f:
+            print(config, file=f)
 
-if __name__ == '__main__':
-    main()
+    main(config, args)
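
The rewritten tune() accumulates errors per grid point across batches and reports a running argmin, so a truncated run (--num_batches=10 in the new tune.sh) still produces a usable (alpha, beta). The bookkeeping, reduced to a standalone sketch with a stubbed scoring function standing in for decode_probs plus errors_func:

import numpy as np

params_grid = [(alpha, beta) for alpha in np.linspace(0.0, 5.0, 3)
               for beta in np.linspace(-6, 6, 3)]
err_sum = [0.0 for _ in params_grid]
len_refs = [0 for _ in params_grid]

def fake_decode_and_score(alpha, beta):
    """Stub for decoding and scoring one batch (synthetic bowl-shaped error)."""
    errors = (alpha - 2.5) ** 2 + (beta - 0.3) ** 2 + 1.0
    return errors, 1   # (error count, reference length)

for cur_batch in range(4):   # stands in for `for infer_data in valid_loader()`
    for index, (alpha, beta) in enumerate(params_grid):
        errors, len_ref = fake_decode_and_score(alpha, beta)
        err_sum[index] += errors
        len_refs[index] += len_ref
    err_ave = [err_sum[i] / len_refs[i] for i in range(len(params_grid))]
    min_index = int(np.argmin(err_ave))
    print("Batch %d, current opt (alpha, beta) = (%.3f, %.3f)" %
          ((cur_batch,) + params_grid[min_index]))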
