diff --git a/.gitignore b/.gitignore index dde3895fc..93b7544a4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,7 @@ .DS_Store *.pyc +tools/venv +.vscode +*.log +*.pdmodel +*.pdiparams* diff --git a/.notebook/dataloader.ipynb b/.notebook/dataloader.ipynb new file mode 100644 index 000000000..e2b8b3a0a --- /dev/null +++ b/.notebook/dataloader.ipynb @@ -0,0 +1,389 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "emerging-meter", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/layers/utils.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " def convert_to_list(value, n, name, dtype=np.int):\n", + "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/scipy/fftpack/__init__.py:103: DeprecationWarning: The module numpy.dual is deprecated. Instead of using dual, use the functions directly from numpy or scipy.\n", + " from numpy.dual import register_func\n", + "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/scipy/special/orthogonal.py:81: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " from numpy import (exp, inf, pi, sqrt, floor, sin, cos, around, int,\n", + "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/numba/core/types/__init__.py:108: DeprecationWarning: `np.long` is a deprecated alias for `np.compat.long`. To silence this warning, use `np.compat.long` by itself. In the likely event your code does not need to work on Python 2 you can use the builtin `int` for which `np.compat.long` is itself an alias. Doing this will not modify any behaviour and is safe. When replacing `np.long`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " long_ = _make_signed(np.long)\n", + "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/numba/core/types/__init__.py:109: DeprecationWarning: `np.long` is a deprecated alias for `np.compat.long`. To silence this warning, use `np.compat.long` by itself. In the likely event your code does not need to work on Python 2 you can use the builtin `int` for which `np.compat.long` is itself an alias. Doing this will not modify any behaviour and is safe. When replacing `np.long`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. 
If you wish to review your current use, check the release note link for additional information.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " ulong = _make_unsigned(np.long)\n" + ] + } + ], + "source": [ + "import math\n", + "import random\n", + "import tarfile\n", + "import logging\n", + "import numpy as np\n", + "from collections import namedtuple\n", + "from functools import partial\n", + "\n", + "import paddle\n", + "from paddle.io import Dataset\n", + "from paddle.io import DataLoader\n", + "from paddle.io import BatchSampler\n", + "from paddle.io import DistributedBatchSampler\n", + "from paddle import distributed as dist\n", + "\n", + "from data_utils.utility import read_manifest\n", + "from data_utils.augmentor.augmentation import AugmentationPipeline\n", + "from data_utils.featurizer.speech_featurizer import SpeechFeaturizer\n", + "from data_utils.speech import SpeechSegment\n", + "from data_utils.normalizer import FeatureNormalizer\n", + "\n", + "\n", + "from data_utils.dataset import (\n", + " DeepSpeech2Dataset,\n", + " DeepSpeech2DistributedBatchSampler,\n", + " DeepSpeech2BatchSampler,\n", + " SpeechCollator,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "excessive-american", + "metadata": {}, + "outputs": [], + "source": [ + "def create_dataloader(manifest_path,\t\n", + " vocab_filepath,\t\n", + " mean_std_filepath,\t\n", + " augmentation_config='{}',\t\n", + " max_duration=float('inf'),\t\n", + " min_duration=0.0,\t\n", + " stride_ms=10.0,\t\n", + " window_ms=20.0,\t\n", + " max_freq=None,\t\n", + " specgram_type='linear',\t\n", + " use_dB_normalization=True,\t\n", + " random_seed=0,\t\n", + " keep_transcription_text=False,\t\n", + " is_training=False,\t\n", + " batch_size=1,\t\n", + " num_workers=0,\t\n", + " sortagrad=False,\t\n", + " shuffle_method=None,\t\n", + " dist=False):\t\n", + "\n", + " dataset = DeepSpeech2Dataset(\t\n", + " manifest_path,\t\n", + " vocab_filepath,\t\n", + " mean_std_filepath,\t\n", + " augmentation_config=augmentation_config,\t\n", + " max_duration=max_duration,\t\n", + " min_duration=min_duration,\t\n", + " stride_ms=stride_ms,\t\n", + " window_ms=window_ms,\t\n", + " max_freq=max_freq,\t\n", + " specgram_type=specgram_type,\t\n", + " use_dB_normalization=use_dB_normalization,\t\n", + " random_seed=random_seed,\t\n", + " keep_transcription_text=keep_transcription_text)\t\n", + "\n", + " if dist:\t\n", + " batch_sampler = DeepSpeech2DistributedBatchSampler(\t\n", + " dataset,\t\n", + " batch_size,\t\n", + " num_replicas=None,\t\n", + " rank=None,\t\n", + " shuffle=is_training,\t\n", + " drop_last=is_training,\t\n", + " sortagrad=is_training,\t\n", + " shuffle_method=shuffle_method)\t\n", + " else:\t\n", + " batch_sampler = DeepSpeech2BatchSampler(\t\n", + " dataset,\t\n", + " shuffle=is_training,\t\n", + " batch_size=batch_size,\t\n", + " drop_last=is_training,\t\n", + " sortagrad=is_training,\t\n", + " shuffle_method=shuffle_method)\t\n", + "\n", + " def padding_batch(batch, padding_to=-1, flatten=False, is_training=True):\t\n", + " \"\"\"\t\n", + " Padding audio features with zeros to make them have the same shape (or\t\n", + " a user-defined shape) within one bach.\t\n", + "\n", + " If ``padding_to`` is -1, the maximun shape in the batch will be used\t\n", + " as the target shape for padding. 
Otherwise, `padding_to` will be the\t\n", + " target shape (only refers to the second axis).\t\n", + "\n", + " If `flatten` is True, features will be flatten to 1darray.\t\n", + " \"\"\"\t\n", + " new_batch = []\t\n", + " # get target shape\t\n", + " max_length = max([audio.shape[1] for audio, text in batch])\t\n", + " if padding_to != -1:\t\n", + " if padding_to < max_length:\t\n", + " raise ValueError(\"If padding_to is not -1, it should be larger \"\t\n", + " \"than any instance's shape in the batch\")\t\n", + " max_length = padding_to\t\n", + " max_text_length = max([len(text) for audio, text in batch])\t\n", + " # padding\t\n", + " padded_audios = []\t\n", + " audio_lens = []\t\n", + " texts, text_lens = [], []\t\n", + " for audio, text in batch:\t\n", + " padded_audio = np.zeros([audio.shape[0], max_length])\t\n", + " padded_audio[:, :audio.shape[1]] = audio\t\n", + " if flatten:\t\n", + " padded_audio = padded_audio.flatten()\t\n", + " padded_audios.append(padded_audio)\t\n", + " audio_lens.append(audio.shape[1])\t\n", + "\n", + " padded_text = np.zeros([max_text_length])\n", + " if is_training:\n", + " padded_text[:len(text)] = text\t# ids\n", + " else:\n", + " padded_text[:len(text)] = [ord(t) for t in text] # string\n", + " \n", + " texts.append(padded_text)\t\n", + " text_lens.append(len(text))\t\n", + "\n", + " padded_audios = np.array(padded_audios).astype('float32')\t\n", + " audio_lens = np.array(audio_lens).astype('int64')\t\n", + " texts = np.array(texts).astype('int32')\t\n", + " text_lens = np.array(text_lens).astype('int64')\t\n", + " return padded_audios, texts, audio_lens, text_lens\t\n", + "\n", + " loader = DataLoader(\t\n", + " dataset,\t\n", + " batch_sampler=batch_sampler,\t\n", + " collate_fn=partial(padding_batch, is_training=is_training),\t\n", + " num_workers=num_workers)\t\n", + " return loader" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "naval-brave", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'num_samples': 5, 'beam_size': 500, 'num_proc_bsearch': 8, 'num_conv_layers': 2, 'num_rnn_layers': 3, 'rnn_layer_size': 2048, 'alpha': 2.5, 'beta': 0.3, 'cutoff_prob': 1.0, 'cutoff_top_n': 40, 'use_gru': False, 'use_gpu': True, 'share_rnn_weights': True, 'infer_manifest': 'examples/aishell/data/manifest.dev', 'mean_std_path': 'examples/aishell/data/mean_std.npz', 'vocab_path': 'examples/aishell/data/vocab.txt', 'lang_model_path': 'models/lm/common_crawl_00.prune01111.trie.klm', 'model_path': 'examples/aishell/checkpoints/step_final', 'decoding_method': 'ctc_beam_search', 'error_rate_type': 'wer', 'specgram_type': 'linear'}\n" + ] + } + ], + "source": [ + "import sys\n", + "import argparse\n", + "import functools\n", + "from utils.utility import add_arguments, print_arguments\n", + "parser = argparse.ArgumentParser(description=__doc__)\n", + "add_arg = functools.partial(add_arguments, argparser=parser)\n", + "# yapf: disable\n", + "add_arg('num_samples', int, 5, \"# of samples to infer.\")\n", + "add_arg('beam_size', int, 500, \"Beam search width.\")\n", + "add_arg('num_proc_bsearch', int, 8, \"# of CPUs for beam search.\")\n", + "add_arg('num_conv_layers', int, 2, \"# of convolution layers.\")\n", + "add_arg('num_rnn_layers', int, 3, \"# of recurrent layers.\")\n", + "add_arg('rnn_layer_size', int, 2048, \"# of recurrent cells per layer.\")\n", + "add_arg('alpha', float, 2.5, \"Coef of LM for beam search.\")\n", + "add_arg('beta', float, 0.3, \"Coef of WC for beam search.\")\n", + 
"add_arg('cutoff_prob', float, 1.0, \"Cutoff probability for pruning.\")\n", + "add_arg('cutoff_top_n', int, 40, \"Cutoff number for pruning.\")\n", + "add_arg('use_gru', bool, False, \"Use GRUs instead of simple RNNs.\")\n", + "add_arg('use_gpu', bool, True, \"Use GPU or not.\")\n", + "add_arg('share_rnn_weights',bool, True, \"Share input-hidden weights across \"\n", + " \"bi-directional RNNs. Not for GRU.\")\n", + "add_arg('infer_manifest', str,\n", + " 'examples/aishell/data/manifest.dev',\n", + " \"Filepath of manifest to infer.\")\n", + "add_arg('mean_std_path', str,\n", + " 'examples/aishell/data/mean_std.npz',\n", + " \"Filepath of normalizer's mean & std.\")\n", + "add_arg('vocab_path', str,\n", + " 'examples/aishell/data/vocab.txt',\n", + " \"Filepath of vocabulary.\")\n", + "add_arg('lang_model_path', str,\n", + " 'models/lm/common_crawl_00.prune01111.trie.klm',\n", + " \"Filepath for language model.\")\n", + "add_arg('model_path', str,\n", + " 'examples/aishell/checkpoints/step_final',\n", + " \"If None, the training starts from scratch, \"\n", + " \"otherwise, it resumes from the pre-trained model.\")\n", + "add_arg('decoding_method', str,\n", + " 'ctc_beam_search',\n", + " \"Decoding method. Options: ctc_beam_search, ctc_greedy\",\n", + " choices = ['ctc_beam_search', 'ctc_greedy'])\n", + "add_arg('error_rate_type', str,\n", + " 'wer',\n", + " \"Error rate type for evaluation.\",\n", + " choices=['wer', 'cer'])\n", + "add_arg('specgram_type', str,\n", + " 'linear',\n", + " \"Audio feature type. Options: linear, mfcc.\",\n", + " choices=['linear', 'mfcc'])\n", + "# yapf: disable\n", + "args = parser.parse_args([])\n", + "print(vars(args))" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "bearing-physics", + "metadata": {}, + "outputs": [], + "source": [ + "batch_reader = create_dataloader(\n", + " manifest_path=args.infer_manifest,\n", + " vocab_filepath=args.vocab_path,\n", + " mean_std_filepath=args.mean_std_path,\n", + " augmentation_config='{}',\n", + " #max_duration=float('inf'),\n", + " max_duration=27.0,\n", + " min_duration=0.0,\n", + " stride_ms=10.0,\n", + " window_ms=20.0,\n", + " max_freq=None,\n", + " specgram_type=args.specgram_type,\n", + " use_dB_normalization=True,\n", + " random_seed=0,\n", + " keep_transcription_text=True,\n", + " is_training=False,\n", + " batch_size=args.num_samples,\n", + " sortagrad=True,\n", + " shuffle_method=None,\n", + " dist=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "classified-melissa", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "test Tensor(shape=[5, 6], dtype=int32, place=CUDAPinnedPlace, stop_gradient=True,\n", + " [[22823, 26102, 20195, 37324, 0 , 0 ],\n", + " [22238, 26469, 23601, 22909, 0 , 0 ],\n", + " [20108, 26376, 22235, 26085, 0 , 0 ],\n", + " [36824, 35201, 20445, 25345, 32654, 24863],\n", + " [29042, 27748, 21463, 23456, 0 , 0 ]])\n", + "test raw 大时代里\n", + "test raw 煲汤受宠\n", + "audio len Tensor(shape=[5], dtype=int64, place=CUDAPinnedPlace, stop_gradient=True,\n", + " [163, 167, 180, 186, 186])\n", + "test len Tensor(shape=[5], dtype=int64, place=CUDAPlace(0), stop_gradient=True,\n", + " [4, 4, 4, 6, 4])\n", + "audio Tensor(shape=[5, 161, 186], dtype=float32, place=CUDAPinnedPlace, stop_gradient=True,\n", + " [[[ 1.11669052, 0.79015088, 0.93658292, ..., 0. , 0. , 0. ],\n", + " [ 0.83549136, 0.72643483, 0.83578080, ..., 0. , 0. , 0. ],\n", + " [-0.89155018, -0.18894747, -0.53357804, ..., 0. , 0. , 0. 
],\n", + " ...,\n", + " [ 0.33386710, -0.81240511, 0.12869737, ..., 0. , 0. , 0. ],\n", + " [-0.17537928, 0.58380985, 0.70696265, ..., 0. , 0. , 0. ],\n", + " [-0.84175998, 1.22041416, 0.07929770, ..., 0. , 0. , 0. ]],\n", + "\n", + " [[-0.35964420, 0.77392709, 0.71409988, ..., 0. , 0. , 0. ],\n", + " [-0.15990183, 0.42962283, 0.06222462, ..., 0. , 0. , 0. ],\n", + " [-0.31166190, -0.74864638, -0.52836996, ..., 0. , 0. , 0. ],\n", + " ...,\n", + " [-0.27546275, 0.32889456, 0.12410031, ..., 0. , 0. , 0. ],\n", + " [ 0.16264282, 0.49418071, -0.15960945, ..., 0. , 0. , 0. ],\n", + " [ 0.12476666, 0.00516864, 1.16021466, ..., 0. , 0. , 0. ]],\n", + "\n", + " [[ 0.90202141, 1.48541915, 0.92062062, ..., 0. , 0. , 0. ],\n", + " [ 0.82661545, 1.37171340, 0.86746097, ..., 0. , 0. , 0. ],\n", + " [-0.62287915, -0.48645937, 0.35041964, ..., 0. , 0. , 0. ],\n", + " ...,\n", + " [ 0.07376949, 0.07138316, 0.76355994, ..., 0. , 0. , 0. ],\n", + " [-0.32306790, 0.43247896, 1.27311838, ..., 0. , 0. , 0. ],\n", + " [-0.97667056, 0.60747612, 0.79181534, ..., 0. , 0. , 0. ]],\n", + "\n", + " [[ 0.72022128, 0.95428467, 0.92766261, ..., 0.29105374, -0.45564806, -0.62151009],\n", + " [ 0.42083180, 0.49279949, 0.82724041, ..., -0.17333922, -1.45363355, -0.61673522],\n", + " [-0.76116520, -0.84750438, -0.09512503, ..., -1.01497340, -1.42781055, -0.80859023],\n", + " ...,\n", + " [-0.23009977, 1.06155431, 1.09065628, ..., 0.25581080, 0.53794998, -1.22650719],\n", + " [-1.37693381, 0.30778193, 0.17152318, ..., 0.51650339, 0.25580606, 0.83097816],\n", + " [-1.62180591, 1.30567718, 1.09928656, ..., -0.77590007, 1.27712476, 0.53189957]],\n", + "\n", + " [[ 1.03205252, -0.51535392, 0.21077573, ..., 0.76618457, 1.27425683, 1.52250278],\n", + " [ 0.82059991, 0.43990925, 0.13090958, ..., 0.86662549, 1.01687658, 1.48495352],\n", + " [-0.75489789, -0.01997089, -0.65174174, ..., 0.09061214, -0.55211234, -0.01614586],\n", + " ...,\n", + " [ 0.50985396, 1.84555030, 0.79185146, ..., 1.13666189, 1.19898069, 1.98158395],\n", + " [ 1.98721015, 2.52385354, 1.11714780, ..., 0.19416514, 1.11329341, 0.64460152],\n", + " [ 2.69512844, 1.90993905, 0.50245082, ..., -0.50902629, 0.03333465, -1.24584770]]])\n" + ] + } + ], + "source": [ + "for idx, (audio, text, audio_len, text_len) in enumerate(batch_reader()):\n", + " print('test', text)\n", + " print(\"test raw\", ''.join( chr(i) for i in text[0][:int(text_len[0])] ))\n", + " print(\"test raw\", ''.join( chr(i) for i in text[-1][:int(text_len[-1])] ))\n", + " print('audio len', audio_len)\n", + " print('test len', text_len)\n", + " print('audio', audio)\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "unexpected-skating", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "minus-modern", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/.notebook/train_test.ipynb b/.notebook/train_test.ipynb new file mode 100644 index 000000000..bedad6e11 --- /dev/null +++ b/.notebook/train_test.ipynb @@ -0,0 +1,1887 @@ +{ + "cells": [ + { + "cell_type": "code", + 
"execution_count": 1, + "id": "cloudy-glass", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ['CUDA_VISISBLE_DEVICES'] = '0'" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "grand-stephen", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/layers/utils.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " def convert_to_list(value, n, name, dtype=np.int):\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.0.0\n" + ] + } + ], + "source": [ + "import paddle\n", + "print(paddle.__version__)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "isolated-prize", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "romance-samuel", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'num_samples': 5, 'beam_size': 500, 'num_proc_bsearch': 8, 'num_conv_layers': 2, 'num_rnn_layers': 3, 'rnn_layer_size': 2048, 'alpha': 2.5, 'beta': 0.3, 'cutoff_prob': 1.0, 'cutoff_top_n': 40, 'use_gru': False, 'use_gpu': True, 'share_rnn_weights': True, 'infer_manifest': 'examples/aishell/data/manifest.dev', 'mean_std_path': 'examples/aishell/data/mean_std.npz', 'vocab_path': 'examples/aishell/data/vocab.txt', 'lang_model_path': 'models/lm/common_crawl_00.prune01111.trie.klm', 'model_path': 'examples/aishell/checkpoints/step_final', 'decoding_method': 'ctc_beam_search', 'error_rate_type': 'wer', 'specgram_type': 'linear'}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. 
Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", + " and should_run_async(code)\n" + ] + } + ], + "source": [ + "import sys\n", + "import argparse\n", + "import functools\n", + "from utils.utility import add_arguments, print_arguments\n", + "parser = argparse.ArgumentParser(description=__doc__)\n", + "add_arg = functools.partial(add_arguments, argparser=parser)\n", + "# yapf: disable\n", + "add_arg('num_samples', int, 5, \"# of samples to infer.\")\n", + "add_arg('beam_size', int, 500, \"Beam search width.\")\n", + "add_arg('num_proc_bsearch', int, 8, \"# of CPUs for beam search.\")\n", + "add_arg('num_conv_layers', int, 2, \"# of convolution layers.\")\n", + "add_arg('num_rnn_layers', int, 3, \"# of recurrent layers.\")\n", + "add_arg('rnn_layer_size', int, 2048, \"# of recurrent cells per layer.\")\n", + "add_arg('alpha', float, 2.5, \"Coef of LM for beam search.\")\n", + "add_arg('beta', float, 0.3, \"Coef of WC for beam search.\")\n", + "add_arg('cutoff_prob', float, 1.0, \"Cutoff probability for pruning.\")\n", + "add_arg('cutoff_top_n', int, 40, \"Cutoff number for pruning.\")\n", + "add_arg('use_gru', bool, False, \"Use GRUs instead of simple RNNs.\")\n", + "add_arg('use_gpu', bool, True, \"Use GPU or not.\")\n", + "add_arg('share_rnn_weights',bool, True, \"Share input-hidden weights across \"\n", + " \"bi-directional RNNs. Not for GRU.\")\n", + "add_arg('infer_manifest', str,\n", + " 'examples/aishell/data/manifest.dev',\n", + " \"Filepath of manifest to infer.\")\n", + "add_arg('mean_std_path', str,\n", + " 'examples/aishell/data/mean_std.npz',\n", + " \"Filepath of normalizer's mean & std.\")\n", + "add_arg('vocab_path', str,\n", + " 'examples/aishell/data/vocab.txt',\n", + " \"Filepath of vocabulary.\")\n", + "add_arg('lang_model_path', str,\n", + " 'models/lm/common_crawl_00.prune01111.trie.klm',\n", + " \"Filepath for language model.\")\n", + "add_arg('model_path', str,\n", + " 'examples/aishell/checkpoints/step_final',\n", + " \"If None, the training starts from scratch, \"\n", + " \"otherwise, it resumes from the pre-trained model.\")\n", + "add_arg('decoding_method', str,\n", + " 'ctc_beam_search',\n", + " \"Decoding method. Options: ctc_beam_search, ctc_greedy\",\n", + " choices = ['ctc_beam_search', 'ctc_greedy'])\n", + "add_arg('error_rate_type', str,\n", + " 'wer',\n", + " \"Error rate type for evaluation.\",\n", + " choices=['wer', 'cer'])\n", + "add_arg('specgram_type', str,\n", + " 'linear',\n", + " \"Audio feature type. Options: linear, mfcc.\",\n", + " choices=['linear', 'mfcc'])\n", + "# yapf: disable\n", + "args = parser.parse_args([])\n", + "print(vars(args))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "timely-bikini", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/scipy/fftpack/__init__.py:103: DeprecationWarning: The module numpy.dual is deprecated. Instead of using dual, use the functions directly from numpy or scipy.\n", + " from numpy.dual import register_func\n", + "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/scipy/special/orthogonal.py:81: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. 
`np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " from numpy import (exp, inf, pi, sqrt, floor, sin, cos, around, int,\n", + "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/numba/core/types/__init__.py:108: DeprecationWarning: `np.long` is a deprecated alias for `np.compat.long`. To silence this warning, use `np.compat.long` by itself. In the likely event your code does not need to work on Python 2 you can use the builtin `int` for which `np.compat.long` is itself an alias. Doing this will not modify any behaviour and is safe. When replacing `np.long`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " long_ = _make_signed(np.long)\n", + "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/numba/core/types/__init__.py:109: DeprecationWarning: `np.long` is a deprecated alias for `np.compat.long`. To silence this warning, use `np.compat.long` by itself. In the likely event your code does not need to work on Python 2 you can use the builtin `int` for which `np.compat.long` is itself an alias. Doing this will not modify any behaviour and is safe. When replacing `np.long`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " ulong = _make_unsigned(np.long)\n" + ] + } + ], + "source": [ + "from data_utils.dataset import create_dataloader\n", + "batch_reader = create_dataloader(\n", + " manifest_path=args.infer_manifest,\n", + " vocab_filepath=args.vocab_path,\n", + " mean_std_filepath=args.mean_std_path,\n", + " augmentation_config='{}',\n", + " #max_duration=float('inf'),\n", + " max_duration=27.0,\n", + " min_duration=0.0,\n", + " stride_ms=10.0,\n", + " window_ms=20.0,\n", + " max_freq=None,\n", + " specgram_type=args.specgram_type,\n", + " use_dB_normalization=True,\n", + " random_seed=0,\n", + " keep_transcription_text=False,\n", + " is_training=False,\n", + " batch_size=args.num_samples,\n", + " sortagrad=True,\n", + " shuffle_method=None,\n", + " dist=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "organized-warrior", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", + " and should_run_async(code)\n", + "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dataloader/dataloader_iter.py:354: DeprecationWarning: `np.object` is a deprecated alias for the builtin `object`. To silence this warning, use `object` by itself. Doing this will not modify any behavior and is safe. 
\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " if arr.dtype == np.object:\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "test Tensor(shape=[5, 6], dtype=int32, place=CUDAPinnedPlace, stop_gradient=True,\n", + " [[14 , 34 , 322 , 233 , 0 , 0 ],\n", + " [238 , 38 , 122 , 164 , 0 , 0 ],\n", + " [8 , 52 , 49 , 42 , 0 , 0 ],\n", + " [109 , 47 , 146 , 193 , 210 , 479 ],\n", + " [3330, 1751, 208 , 1923, 0 , 0 ]])\n", + "test raw 大时代里的的\n", + "test raw 煲汤受宠的的\n", + "audio len Tensor(shape=[5], dtype=int64, place=CUDAPinnedPlace, stop_gradient=True,\n", + " [163, 167, 180, 186, 186])\n", + "test len Tensor(shape=[5], dtype=int64, place=CUDAPinnedPlace, stop_gradient=True,\n", + " [4, 4, 4, 6, 4])\n", + "audio Tensor(shape=[5, 161, 186], dtype=float32, place=CUDAPinnedPlace, stop_gradient=True,\n", + " [[[ 1.11669052, 0.79015088, 0.93658292, ..., 0. , 0. , 0. ],\n", + " [ 0.83549136, 0.72643483, 0.83578080, ..., 0. , 0. , 0. ],\n", + " [-0.89155018, -0.18894747, -0.53357804, ..., 0. , 0. , 0. ],\n", + " ...,\n", + " [ 0.33386710, -0.81240511, 0.12869737, ..., 0. , 0. , 0. ],\n", + " [-0.17537928, 0.58380985, 0.70696265, ..., 0. , 0. , 0. ],\n", + " [-0.84175998, 1.22041416, 0.07929770, ..., 0. , 0. , 0. ]],\n", + "\n", + " [[-0.35964420, 0.77392709, 0.71409988, ..., 0. , 0. , 0. ],\n", + " [-0.15990183, 0.42962283, 0.06222462, ..., 0. , 0. , 0. ],\n", + " [-0.31166190, -0.74864638, -0.52836996, ..., 0. , 0. , 0. ],\n", + " ...,\n", + " [-0.27546275, 0.32889456, 0.12410031, ..., 0. , 0. , 0. ],\n", + " [ 0.16264282, 0.49418071, -0.15960945, ..., 0. , 0. , 0. ],\n", + " [ 0.12476666, 0.00516864, 1.16021466, ..., 0. , 0. , 0. ]],\n", + "\n", + " [[ 0.90202141, 1.48541915, 0.92062062, ..., 0. , 0. , 0. ],\n", + " [ 0.82661545, 1.37171340, 0.86746097, ..., 0. , 0. , 0. ],\n", + " [-0.62287915, -0.48645937, 0.35041964, ..., 0. , 0. , 0. ],\n", + " ...,\n", + " [ 0.07376949, 0.07138316, 0.76355994, ..., 0. , 0. , 0. ],\n", + " [-0.32306790, 0.43247896, 1.27311838, ..., 0. , 0. , 0. ],\n", + " [-0.97667056, 0.60747612, 0.79181534, ..., 0. , 0. , 0. 
]],\n", + "\n", + " [[ 0.72022128, 0.95428467, 0.92766261, ..., 0.29105374, -0.45564806, -0.62151009],\n", + " [ 0.42083180, 0.49279949, 0.82724041, ..., -0.17333922, -1.45363355, -0.61673522],\n", + " [-0.76116520, -0.84750438, -0.09512503, ..., -1.01497340, -1.42781055, -0.80859023],\n", + " ...,\n", + " [-0.23009977, 1.06155431, 1.09065628, ..., 0.25581080, 0.53794998, -1.22650719],\n", + " [-1.37693381, 0.30778193, 0.17152318, ..., 0.51650339, 0.25580606, 0.83097816],\n", + " [-1.62180591, 1.30567718, 1.09928656, ..., -0.77590007, 1.27712476, 0.53189957]],\n", + "\n", + " [[ 1.03205252, -0.51535392, 0.21077573, ..., 0.76618457, 1.27425683, 1.52250278],\n", + " [ 0.82059991, 0.43990925, 0.13090958, ..., 0.86662549, 1.01687658, 1.48495352],\n", + " [-0.75489789, -0.01997089, -0.65174174, ..., 0.09061214, -0.55211234, -0.01614586],\n", + " ...,\n", + " [ 0.50985396, 1.84555030, 0.79185146, ..., 1.13666189, 1.19898069, 1.98158395],\n", + " [ 1.98721015, 2.52385354, 1.11714780, ..., 0.19416514, 1.11329341, 0.64460152],\n", + " [ 2.69512844, 1.90993905, 0.50245082, ..., -0.50902629, 0.03333465, -1.24584770]]])\n" + ] + } + ], + "source": [ + " for idx, (audio, text, audio_len, text_len) in enumerate(batch_reader()):\n", + " print('test', text)\n", + " print(\"test raw\", ''.join(batch_reader.dataset.vocab_list[i] for i in text[0]))\n", + " print(\"test raw\", ''.join(batch_reader.dataset.vocab_list[i] for i in text[-1]))\n", + " print('audio len', audio_len)\n", + " print('test len', text_len)\n", + " print('audio', audio)\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "confidential-radius", + "metadata": {}, + "outputs": [], + "source": [ + "# reader = batch_reader()\n", + "# audio, test , audio_len, text_len = reader.next()\n", + "# print('test', text)\n", + "# print('t len', text_len) #[B, T]\n", + "# print('audio len', audio_len)\n", + "# print(audio)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "future-vermont", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "煲汤受宠\n" + ] + } + ], + "source": [ + "print(u'\\u7172\\u6c64\\u53d7\\u5ba0')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dental-sweden", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "sunrise-contact", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "hispanic-asthma", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "hearing-leadership", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "skilled-friday", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "copyrighted-measure", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "employed-lightweight", + "metadata": {}, + "outputs": [], + "source": [ + "from model_utils.network import DeepSpeech2, DeepSpeech2Loss\n", + "\n", + "from data_utils.dataset import create_dataloader\n", + "batch_reader = create_dataloader(\n", + " manifest_path=args.infer_manifest,\n", + " vocab_filepath=args.vocab_path,\n", + " mean_std_filepath=args.mean_std_path,\n", + " augmentation_config='{}',\n", + " #max_duration=float('inf'),\n", + " max_duration=27.0,\n", + " 
min_duration=0.0,\n", + " stride_ms=10.0,\n", + " window_ms=20.0,\n", + " max_freq=None,\n", + " specgram_type=args.specgram_type,\n", + " use_dB_normalization=True,\n", + " random_seed=0,\n", + " keep_transcription_text=False,\n", + " is_training=False,\n", + " batch_size=args.num_samples,\n", + " sortagrad=True,\n", + " shuffle_method=None,\n", + " dist=False)\n", + "\n", + "\n", + "import paddle\n", + "from paddle import nn\n", + "from paddle.nn import functional as F\n", + "from paddle.nn import initializer as I\n", + "\n", + "import math\n", + "\n", + "def brelu(x, t_min=0.0, t_max=24.0, name=None):\n", + " t_min = paddle.to_tensor(t_min)\n", + " t_max = paddle.to_tensor(t_max)\n", + " return x.maximum(t_min).minimum(t_max)\n", + "\n", + "def sequence_mask(x_len, max_len=None, dtype='float32'):\n", + " max_len = max_len or x_len.max()\n", + " x_len = paddle.unsqueeze(x_len, -1)\n", + " row_vector = paddle.arange(max_len)\n", + " mask = row_vector > x_len # maybe a bug\n", + " mask = paddle.cast(mask, dtype)\n", + " print(f'seq mask: {mask}')\n", + " return mask\n", + "\n", + "\n", + "class ConvBn(nn.Layer):\n", + " def __init__(self, num_channels_in, num_channels_out, kernel_size, stride,\n", + " padding, act):\n", + "\n", + " super().__init__()\n", + " self.kernel_size = kernel_size\n", + " self.stride = stride\n", + " self.padding = padding\n", + "\n", + " self.conv = nn.Conv2D(\n", + " num_channels_in,\n", + " num_channels_out,\n", + " kernel_size=kernel_size,\n", + " stride=stride,\n", + " padding=padding,\n", + " weight_attr=None,\n", + " bias_attr=None,\n", + " data_format='NCHW')\n", + "\n", + " self.bn = nn.BatchNorm2D(\n", + " num_channels_out,\n", + " weight_attr=None,\n", + " bias_attr=None,\n", + " data_format='NCHW')\n", + " self.act = F.relu if act == 'relu' else brelu\n", + "\n", + " def forward(self, x, x_len):\n", + " \"\"\"\n", + " x(Tensor): audio, shape [B, C, D, T]\n", + " \"\"\"\n", + " x = self.conv(x)\n", + " x = self.bn(x)\n", + " x = self.act(x)\n", + "\n", + " x_len = (x_len - self.kernel_size[1] + 2 * self.padding[1]\n", + " ) // self.stride[1] + 1\n", + "\n", + " # reset padding part to 0\n", + " masks = sequence_mask(x_len) #[B, T]\n", + " masks = masks.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T]\n", + " x = x.multiply(masks)\n", + "\n", + " return x, x_len\n", + "\n", + "\n", + "class ConvStack(nn.Layer):\n", + " def __init__(self, feat_size, num_stacks):\n", + " super().__init__()\n", + " self.feat_size = feat_size # D\n", + " self.num_stacks = num_stacks\n", + "\n", + " self.conv_in = ConvBn(\n", + " num_channels_in=1,\n", + " num_channels_out=32,\n", + " kernel_size=(41, 11), #[D, T]\n", + " stride=(2, 3),\n", + " padding=(20, 5),\n", + " act='brelu')\n", + "\n", + " out_channel = 32\n", + " self.conv_stack = nn.LayerList([\n", + " ConvBn(\n", + " num_channels_in=32,\n", + " num_channels_out=out_channel,\n", + " kernel_size=(21, 11),\n", + " stride=(2, 1),\n", + " padding=(10, 5),\n", + " act='brelu') for i in range(num_stacks - 1)\n", + " ])\n", + "\n", + " # conv output feat_dim\n", + " output_height = (feat_size - 1) // 2 + 1\n", + " for i in range(self.num_stacks - 1):\n", + " output_height = (output_height - 1) // 2 + 1\n", + " self.output_height = out_channel * output_height\n", + "\n", + " def forward(self, x, x_len):\n", + " \"\"\"\n", + " x: shape [B, C, D, T]\n", + " x_len : shape [B]\n", + " \"\"\"\n", + " print(f\"conv in: {x_len}\")\n", + " x, x_len = self.conv_in(x, x_len)\n", + " for i, conv in enumerate(self.conv_stack):\n", + " 
print(f\"conv in: {x_len}\")\n", + " x, x_len = conv(x, x_len)\n", + " print(f\"conv out: {x_len}\")\n", + " return x, x_len\n", + " \n", + " \n", + "\n", + "class RNNCell(nn.RNNCellBase):\n", + " r\"\"\"\n", + " Elman RNN (SimpleRNN) cell. Given the inputs and previous states, it \n", + " computes the outputs and updates states.\n", + " The formula used is as follows:\n", + " .. math::\n", + " h_{t} & = act(x_{t} + b_{ih} + W_{hh}h_{t-1} + b_{hh})\n", + " y_{t} & = h_{t}\n", + " \n", + " where :math:`act` is for :attr:`activation`.\n", + " \"\"\"\n", + "\n", + " def __init__(self,\n", + " hidden_size,\n", + " activation=\"tanh\",\n", + " weight_ih_attr=None,\n", + " weight_hh_attr=None,\n", + " bias_ih_attr=None,\n", + " bias_hh_attr=None,\n", + " name=None):\n", + " super().__init__()\n", + " std = 1.0 / math.sqrt(hidden_size)\n", + " self.weight_hh = self.create_parameter(\n", + " (hidden_size, hidden_size),\n", + " weight_hh_attr,\n", + " default_initializer=I.Uniform(-std, std))\n", + " # self.bias_ih = self.create_parameter(\n", + " # (hidden_size, ),\n", + " # bias_ih_attr,\n", + " # is_bias=True,\n", + " # default_initializer=I.Uniform(-std, std))\n", + " self.bias_ih = None\n", + " self.bias_hh = self.create_parameter(\n", + " (hidden_size, ),\n", + " bias_hh_attr,\n", + " is_bias=True,\n", + " default_initializer=I.Uniform(-std, std))\n", + "\n", + " self.hidden_size = hidden_size\n", + " if activation not in [\"tanh\", \"relu\", \"brelu\"]:\n", + " raise ValueError(\n", + " \"activation for SimpleRNNCell should be tanh or relu, \"\n", + " \"but get {}\".format(activation))\n", + " self.activation = activation\n", + " self._activation_fn = paddle.tanh \\\n", + " if activation == \"tanh\" \\\n", + " else F.relu\n", + " if activation == 'brelu':\n", + " self._activation_fn = brelu\n", + "\n", + " def forward(self, inputs, states=None):\n", + " if states is None:\n", + " states = self.get_initial_states(inputs, self.state_shape)\n", + " pre_h = states\n", + " i2h = inputs\n", + " if self.bias_ih is not None:\n", + " i2h += self.bias_ih\n", + " h2h = paddle.matmul(pre_h, self.weight_hh, transpose_y=True)\n", + " if self.bias_hh is not None:\n", + " h2h += self.bias_hh\n", + " h = self._activation_fn(i2h + h2h)\n", + " return h, h\n", + "\n", + " @property\n", + " def state_shape(self):\n", + " return (self.hidden_size, )\n", + "\n", + "\n", + "class GRUCellShare(nn.RNNCellBase):\n", + " r\"\"\"\n", + " Gated Recurrent Unit (GRU) RNN cell. Given the inputs and previous states, \n", + " it computes the outputs and updates states.\n", + " The formula for GRU used is as follows:\n", + " .. 
math::\n", + " r_{t} & = \\sigma(W_{ir}x_{t} + b_{ir} + W_{hr}h_{t-1} + b_{hr})\n", + " z_{t} & = \\sigma(W_{iz}x_{t} + b_{iz} + W_{hz}h_{t-1} + b_{hz})\n", + " \\widetilde{h}_{t} & = \\tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}h_{t-1} + b_{hc}))\n", + " h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \\widetilde{h}_{t}\n", + " y_{t} & = h_{t}\n", + " \n", + " where :math:`\\sigma` is the sigmoid fucntion, and * is the elemetwise \n", + " multiplication operator.\n", + " \"\"\"\n", + "\n", + " def __init__(self,\n", + " input_size,\n", + " hidden_size,\n", + " weight_ih_attr=None,\n", + " weight_hh_attr=None,\n", + " bias_ih_attr=None,\n", + " bias_hh_attr=None,\n", + " name=None):\n", + " super().__init__()\n", + " std = 1.0 / math.sqrt(hidden_size)\n", + " self.weight_hh = self.create_parameter(\n", + " (3 * hidden_size, hidden_size),\n", + " weight_hh_attr,\n", + " default_initializer=I.Uniform(-std, std))\n", + " # self.bias_ih = self.create_parameter(\n", + " # (3 * hidden_size, ),\n", + " # bias_ih_attr,\n", + " # is_bias=True,\n", + " # default_initializer=I.Uniform(-std, std))\n", + " self.bias_ih = None\n", + " self.bias_hh = self.create_parameter(\n", + " (3 * hidden_size, ),\n", + " bias_hh_attr,\n", + " is_bias=True,\n", + " default_initializer=I.Uniform(-std, std))\n", + "\n", + " self.hidden_size = hidden_size\n", + " self.input_size = input_size\n", + " self._gate_activation = F.sigmoid\n", + " #self._activation = paddle.tanh\n", + " self._activation = F.relu\n", + "\n", + " def forward(self, inputs, states=None):\n", + " if states is None:\n", + " states = self.get_initial_states(inputs, self.state_shape)\n", + "\n", + " pre_hidden = states\n", + " x_gates = inputs\n", + " if self.bias_ih is not None:\n", + " x_gates = x_gates + self.bias_ih\n", + " h_gates = paddle.matmul(pre_hidden, self.weight_hh, transpose_y=True)\n", + " if self.bias_hh is not None:\n", + " h_gates = h_gates + self.bias_hh\n", + "\n", + " x_r, x_z, x_c = paddle.split(x_gates, num_or_sections=3, axis=1)\n", + " h_r, h_z, h_c = paddle.split(h_gates, num_or_sections=3, axis=1)\n", + "\n", + " r = self._gate_activation(x_r + h_r)\n", + " z = self._gate_activation(x_z + h_z)\n", + " c = self._activation(x_c + r * h_c) # apply reset gate after mm\n", + " h = (pre_hidden - c) * z + c\n", + " # https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/fluid/layers/dynamic_gru_cn.html#dynamic-gru\n", + " #h = (1-z) * pre_hidden + z * c\n", + "\n", + " return h, h\n", + "\n", + " @property\n", + " def state_shape(self):\n", + " r\"\"\"\n", + " The `state_shape` of GRUCell is a shape `[hidden_size]` (-1 for batch\n", + " size would be automatically inserted into shape). 
The shape corresponds\n", + " to the shape of :math:`h_{t-1}`.\n", + " \"\"\"\n", + " return (self.hidden_size, )\n", + "\n", + "\n", + "class BiRNNWithBN(nn.Layer):\n", + " \"\"\"Bidirectonal simple rnn layer with sequence-wise batch normalization.\n", + " The batch normalization is only performed on input-state weights.\n", + "\n", + " :param name: Name of the layer parameters.\n", + " :type name: string\n", + " :param size: Dimension of RNN cells.\n", + " :type size: int\n", + " :param share_weights: Whether to share input-hidden weights between\n", + " forward and backward directional RNNs.\n", + " :type share_weights: bool\n", + " :return: Bidirectional simple rnn layer.\n", + " :rtype: Variable\n", + " \"\"\"\n", + "\n", + " def __init__(self, i_size, h_size, share_weights):\n", + " super().__init__()\n", + " self.share_weights = share_weights\n", + " if self.share_weights:\n", + " #input-hidden weights shared between bi-directional rnn.\n", + " self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False)\n", + " # batch norm is only performed on input-state projection\n", + " self.fw_bn = nn.BatchNorm1D(\n", + " h_size, bias_attr=None, data_format='NLC')\n", + " self.bw_fc = self.fw_fc\n", + " self.bw_bn = self.fw_bn\n", + " else:\n", + " self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False)\n", + " self.fw_bn = nn.BatchNorm1D(\n", + " h_size, bias_attr=None, data_format='NLC')\n", + " self.bw_fc = nn.Linear(i_size, h_size, bias_attr=False)\n", + " self.bw_bn = nn.BatchNorm1D(\n", + " h_size, bias_attr=None, data_format='NLC')\n", + "\n", + " self.fw_cell = RNNCell(hidden_size=h_size, activation='brelu')\n", + " self.bw_cell = RNNCell(hidden_size=h_size, activation='brelu')\n", + " self.fw_rnn = nn.RNN(\n", + " self.fw_cell, is_reverse=False, time_major=False) #[B, T, D]\n", + " self.bw_rnn = nn.RNN(\n", + " self.fw_cell, is_reverse=True, time_major=False) #[B, T, D]\n", + "\n", + " def forward(self, x, x_len):\n", + " # x, shape [B, T, D]\n", + " fw_x = self.fw_bn(self.fw_fc(x))\n", + " bw_x = self.bw_bn(self.bw_fc(x))\n", + " fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len)\n", + " bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len)\n", + " x = paddle.concat([fw_x, bw_x], axis=-1)\n", + " return x, x_len\n", + "\n", + "\n", + "class BiGRUWithBN(nn.Layer):\n", + " \"\"\"Bidirectonal gru layer with sequence-wise batch normalization.\n", + " The batch normalization is only performed on input-state weights.\n", + "\n", + " :param name: Name of the layer.\n", + " :type name: string\n", + " :param input: Input layer.\n", + " :type input: Variable\n", + " :param size: Dimension of GRU cells.\n", + " :type size: int\n", + " :param act: Activation type.\n", + " :type act: string\n", + " :return: Bidirectional GRU layer.\n", + " :rtype: Variable\n", + " \"\"\"\n", + "\n", + " def __init__(self, i_size, h_size, act):\n", + " super().__init__()\n", + " hidden_size = h_size * 3\n", + " self.fw_fc = nn.Linear(i_size, hidden_size, bias_attr=False)\n", + " self.fw_bn = nn.BatchNorm1D(\n", + " hidden_size, bias_attr=None, data_format='NLC')\n", + " self.bw_fc = nn.Linear(i_size, hidden_size, bias_attr=False)\n", + " self.bw_bn = nn.BatchNorm1D(\n", + " hidden_size, bias_attr=None, data_format='NLC')\n", + "\n", + " self.fw_cell = GRUCellShare(input_size=hidden_size, hidden_size=h_size)\n", + " self.bw_cell = GRUCellShare(input_size=hidden_size, hidden_size=h_size)\n", + " self.fw_rnn = nn.RNN(\n", + " self.fw_cell, is_reverse=False, time_major=False) #[B, T, D]\n", + " self.bw_rnn = 
nn.RNN(\n", + " self.fw_cell, is_reverse=True, time_major=False) #[B, T, D]\n", + "\n", + " def forward(self, x, x_len):\n", + " # x, shape [B, T, D]\n", + " fw_x = self.fw_bn(self.fw_fc(x))\n", + " bw_x = self.bw_bn(self.bw_fc(x))\n", + " fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len)\n", + " bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len)\n", + " x = paddle.concat([fw_x, bw_x], axis=-1)\n", + " return x, x_len\n", + "\n", + "\n", + "class RNNStack(nn.Layer):\n", + " \"\"\"RNN group with stacked bidirectional simple RNN or GRU layers.\n", + "\n", + " :param input: Input layer.\n", + " :type input: Variable\n", + " :param size: Dimension of RNN cells in each layer.\n", + " :type size: int\n", + " :param num_stacks: Number of stacked rnn layers.\n", + " :type num_stacks: int\n", + " :param use_gru: Use gru if set True. Use simple rnn if set False.\n", + " :type use_gru: bool\n", + " :param share_rnn_weights: Whether to share input-hidden weights between\n", + " forward and backward directional RNNs.\n", + " It is only available when use_gru=False.\n", + " :type share_weights: bool\n", + " :return: Output layer of the RNN group.\n", + " :rtype: Variable\n", + " \"\"\"\n", + "\n", + " def __init__(self, i_size, h_size, num_stacks, use_gru, share_rnn_weights):\n", + " super().__init__()\n", + " self.rnn_stacks = nn.LayerList()\n", + " for i in range(num_stacks):\n", + " if use_gru:\n", + " #default:GRU using tanh\n", + " self.rnn_stacks.append(\n", + " BiGRUWithBN(i_size=i_size, h_size=h_size, act=\"relu\"))\n", + " else:\n", + " self.rnn_stacks.append(\n", + " BiRNNWithBN(\n", + " i_size=i_size,\n", + " h_size=h_size,\n", + " share_weights=share_rnn_weights))\n", + " i_size = h_size * 2\n", + "\n", + " def forward(self, x, x_len):\n", + " \"\"\"\n", + " x: shape [B, T, D]\n", + " x_len: shpae [B]\n", + " \"\"\"\n", + " for i, rnn in enumerate(self.rnn_stacks):\n", + " x, x_len = rnn(x, x_len)\n", + " masks = sequence_mask(x_len) #[B, T]\n", + " masks = masks.unsqueeze(-1) # [B, T, 1]\n", + " x = x.multiply(masks)\n", + " return x, x_len\n", + "\n", + " \n", + "class DeepSpeech2Test(DeepSpeech2):\n", + " def __init__(self,\n", + " feat_size,\n", + " dict_size,\n", + " num_conv_layers=2,\n", + " num_rnn_layers=3,\n", + " rnn_size=256,\n", + " use_gru=False,\n", + " share_rnn_weights=True):\n", + " super().__init__(feat_size,\n", + " dict_size,\n", + " num_conv_layers=2,\n", + " num_rnn_layers=3,\n", + " rnn_size=256,\n", + " use_gru=False,\n", + " share_rnn_weights=True)\n", + " self.feat_size = feat_size # 161 for linear\n", + " self.dict_size = dict_size\n", + "\n", + " self.conv = ConvStack(feat_size, num_conv_layers)\n", + " \n", + "# self.fc = nn.Linear(1312, dict_size + 1)\n", + "\n", + " i_size = self.conv.output_height # H after conv stack\n", + " self.rnn = RNNStack(\n", + " i_size=i_size,\n", + " h_size=rnn_size,\n", + " num_stacks=num_rnn_layers,\n", + " use_gru=use_gru,\n", + " share_rnn_weights=share_rnn_weights)\n", + " \n", + " self.fc = nn.Linear(rnn_size * 2, dict_size + 1)\n", + " \n", + " def infer(self, audio, audio_len):\n", + " # [B, D, T] -> [B, C=1, D, T]\n", + " audio = audio.unsqueeze(1)\n", + "\n", + " # convolution group\n", + " x, audio_len = self.conv(audio, audio_len)\n", + " print('conv out', x.shape)\n", + "\n", + " # convert data from convolution feature map to sequence of vectors\n", + " B, C, D, T = paddle.shape(x)\n", + " x = x.transpose([0, 3, 1, 2]) #[B, T, C, D]\n", + " x = x.reshape([B, T, C * D]) #[B, T, C*D]\n", + " print('rnn 
input', x.shape)\n", + "\n", + " # remove padding part\n", + " x, audio_len = self.rnn(x, audio_len) #[B, T, D]\n", + " print('rnn output', x.shape)\n", + "\n", + " logits = self.fc(x) #[B, T, V + 1]\n", + "\n", + " #ctcdecoder need probs, not log_probs\n", + " probs = F.softmax(logits)\n", + "\n", + " return logits, probs, audio_len\n", + "\n", + " def forward(self, audio, text, audio_len, text_len):\n", + " \"\"\"\n", + " audio: shape [B, D, T]\n", + " text: shape [B, T]\n", + " audio_len: shape [B]\n", + " text_len: shape [B]\n", + " \"\"\"\n", + " return self.infer(audio, audio_len)\n", + " \n", + "\n", + "feat_dim=161\n", + "\n", + "model = DeepSpeech2Test(\n", + " feat_size=feat_dim,\n", + " dict_size=batch_reader.dataset.vocab_size,\n", + " num_conv_layers=args.num_conv_layers,\n", + " num_rnn_layers=args.num_rnn_layers,\n", + " rnn_size=1024,\n", + " use_gru=args.use_gru,\n", + " share_rnn_weights=args.share_rnn_weights,\n", + " )\n", + "dp_model = model\n", + "#dp_model = paddle.DataParallel(model)\n", + "\n", + "loss_fn = DeepSpeech2Loss(batch_reader.dataset.vocab_size)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "divided-incentive", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "discrete-conjunction", + "metadata": {}, + "outputs": [], + "source": [ + "audio, text, audio_len, text_len = None, None, None, None\n", + "\n", + "for idx, inputs in enumerate(batch_reader):\n", + " audio, text, audio_len, text_len = inputs\n", + "# print(idx)\n", + "# print('a', audio.shape, audio.place)\n", + "# print('t', text)\n", + "# print('al', audio_len)\n", + "# print('tl', text_len)\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "protected-announcement", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "conv in: Tensor(shape=[5], dtype=int64, place=CUDAPinnedPlace, stop_gradient=True,\n", + " [163, 167, 180, 186, 186])\n", + "seq mask: Tensor(shape=[5, 62], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n", + " [[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],\n", + " [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.],\n", + " [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0.],\n", + " [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],\n", + " [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])\n", + "conv in: Tensor(shape=[5], dtype=int64, place=CUDAPlace(0), stop_gradient=True,\n", + " [55, 56, 60, 62, 62])\n", + "seq mask: 
Tensor(shape=[5, 62], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n", + " [[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],\n", + " [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.],\n", + " [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0.],\n", + " [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],\n", + " [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])\n", + "conv out: Tensor(shape=[5], dtype=int64, place=CUDAPlace(0), stop_gradient=True,\n", + " [55, 56, 60, 62, 62])\n", + "conv out [5, 32, 41, 62]\n", + "rnn input [5, 62, 1312]\n", + "seq mask: Tensor(shape=[5, 62], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n", + " [[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],\n", + " [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.],\n", + " [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0.],\n", + " [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],\n", + " [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])\n", + "seq mask: Tensor(shape=[5, 62], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n", + " [[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],\n", + " [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.],\n", + " 
[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0.],\n", + " [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],\n", + " [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/layers/utils.py:77: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working\n", + " return (isinstance(seq, collections.Sequence) and\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "seq mask: Tensor(shape=[5, 62], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n", + " [[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],\n", + " [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.],\n", + " [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0.],\n", + " [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],\n", + " [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])\n", + "rnn output [5, 62, 2048]\n", + "logits len Tensor(shape=[5], dtype=int64, place=CUDAPlace(0), stop_gradient=True,\n", + " [55, 56, 60, 62, 62])\n", + "loss Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n", + " [2316.82153320])\n" + ] + } + ], + "source": [ + "outputs = dp_model(audio, text, audio_len, text_len)\n", + "logits, _, logits_len = outputs\n", + "print('logits len', logits_len)\n", + "loss = loss_fn.forward(logits, text, logits_len, text_len)\n", + "print('loss', loss)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "universal-myrtle", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "param grad: conv.conv_in.conv.weight: shape: [32, 1, 41, 11] stop_grad: False grad: None\n", + "param grad: conv.conv_in.conv.bias: shape: [32] stop_grad: False grad: None\n", + "param grad: conv.conv_in.bn.weight: shape: [32] stop_grad: 
False grad: None\n", + "param grad: conv.conv_in.bn.bias: shape: [32] stop_grad: False grad: None\n", + "param grad: conv.conv_in.bn._mean: shape: [32] stop_grad: True grad: None\n", + "param grad: conv.conv_in.bn._variance: shape: [32] stop_grad: True grad: None\n", + "param grad: conv.conv_stack.0.conv.weight: shape: [32, 32, 21, 11] stop_grad: False grad: None\n", + "param grad: conv.conv_stack.0.conv.bias: shape: [32] stop_grad: False grad: None\n", + "param grad: conv.conv_stack.0.bn.weight: shape: [32] stop_grad: False grad: None\n", + "param grad: conv.conv_stack.0.bn.bias: shape: [32] stop_grad: False grad: None\n", + "param grad: conv.conv_stack.0.bn._mean: shape: [32] stop_grad: True grad: None\n", + "param grad: conv.conv_stack.0.bn._variance: shape: [32] stop_grad: True grad: None\n", + "param grad: rnn.rnn_stacks.0.fw_fc.weight: shape: [1312, 1024] stop_grad: False grad: None\n", + "param grad: rnn.rnn_stacks.0.fw_bn.weight: shape: [1024] stop_grad: False grad: None\n", + "param grad: rnn.rnn_stacks.0.fw_bn.bias: shape: [1024] stop_grad: False grad: None\n", + "param grad: rnn.rnn_stacks.0.fw_bn._mean: shape: [1024] stop_grad: True grad: None\n", + "param grad: rnn.rnn_stacks.0.fw_bn._variance: shape: [1024] stop_grad: True grad: None\n", + "param grad: rnn.rnn_stacks.0.fw_cell.weight_hh: shape: [1024, 1024] stop_grad: False grad: None\n", + "param grad: rnn.rnn_stacks.0.fw_cell.bias_hh: shape: [1024] stop_grad: False grad: None\n", + "param grad: rnn.rnn_stacks.0.bw_cell.weight_hh: shape: [1024, 1024] stop_grad: False grad: None\n", + "param grad: rnn.rnn_stacks.0.bw_cell.bias_hh: shape: [1024] stop_grad: False grad: None\n", + "param grad: rnn.rnn_stacks.1.fw_fc.weight: shape: [2048, 1024] stop_grad: False grad: None\n", + "param grad: rnn.rnn_stacks.1.fw_bn.weight: shape: [1024] stop_grad: False grad: None\n", + "param grad: rnn.rnn_stacks.1.fw_bn.bias: shape: [1024] stop_grad: False grad: None\n", + "param grad: rnn.rnn_stacks.1.fw_bn._mean: shape: [1024] stop_grad: True grad: None\n", + "param grad: rnn.rnn_stacks.1.fw_bn._variance: shape: [1024] stop_grad: True grad: None\n", + "param grad: rnn.rnn_stacks.1.fw_cell.weight_hh: shape: [1024, 1024] stop_grad: False grad: None\n", + "param grad: rnn.rnn_stacks.1.fw_cell.bias_hh: shape: [1024] stop_grad: False grad: None\n", + "param grad: rnn.rnn_stacks.1.bw_cell.weight_hh: shape: [1024, 1024] stop_grad: False grad: None\n", + "param grad: rnn.rnn_stacks.1.bw_cell.bias_hh: shape: [1024] stop_grad: False grad: None\n", + "param grad: rnn.rnn_stacks.2.fw_fc.weight: shape: [2048, 1024] stop_grad: False grad: None\n", + "param grad: rnn.rnn_stacks.2.fw_bn.weight: shape: [1024] stop_grad: False grad: None\n", + "param grad: rnn.rnn_stacks.2.fw_bn.bias: shape: [1024] stop_grad: False grad: None\n", + "param grad: rnn.rnn_stacks.2.fw_bn._mean: shape: [1024] stop_grad: True grad: None\n", + "param grad: rnn.rnn_stacks.2.fw_bn._variance: shape: [1024] stop_grad: True grad: None\n", + "param grad: rnn.rnn_stacks.2.fw_cell.weight_hh: shape: [1024, 1024] stop_grad: False grad: None\n", + "param grad: rnn.rnn_stacks.2.fw_cell.bias_hh: shape: [1024] stop_grad: False grad: None\n", + "param grad: rnn.rnn_stacks.2.bw_cell.weight_hh: shape: [1024, 1024] stop_grad: False grad: None\n", + "param grad: rnn.rnn_stacks.2.bw_cell.bias_hh: shape: [1024] stop_grad: False grad: None\n", + "param grad: fc.weight: shape: [2048, 4299] stop_grad: False grad: None\n", + "param grad: fc.bias: shape: [4299] stop_grad: False grad: None\n" + ] + } + ], + 
"source": [ + "for n, p in dp_model.named_parameters():\n", + " print(\n", + " f\"param grad: {n}: shape: {p.shape} stop_grad: {p.stop_gradient} grad: {p.grad}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "referenced-double", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "param grad: conv.conv_in.conv.weight: shape: [32, 1, 41, 11] stop_grad: False grad: [[[[ 2.1243238 1.696022 3.770659 ... 5.234652 5.4865217\n", + " 4.757795 ]\n", + " [ 2.651376 2.3109848 4.428488 ... 5.353201 8.703288\n", + " 5.1787405 ]\n", + " [ 2.7511077 1.8823049 2.1875212 ... 3.4821286 6.386543\n", + " 3.5026932 ]\n", + " ...\n", + " [ 1.9173846 1.8623551 0.5601456 ... 2.8375719 3.8496673\n", + " 2.359191 ]\n", + " [ 2.3827765 2.497965 1.5914664 ... 2.220721 3.4617734\n", + " 4.829253 ]\n", + " [ 1.6855702 1.5040786 1.8793598 ... 4.0773935 3.176893\n", + " 3.7477999 ]]]\n", + "\n", + "\n", + " [[[ 1.8451455 2.0091445 1.5225713 ... 1.524528 0.17764974\n", + " 1.0245132 ]\n", + " [ 1.9388857 1.3873467 2.044691 ... 0.92544 -0.9746763\n", + " -0.41603735]\n", + " [ 2.6814485 2.6096234 1.6802506 ... 1.902397 1.6837387\n", + " -0.96788657]\n", + " ...\n", + " [ 4.3675485 1.9822174 1.1695029 ... 1.4672399 3.2029557\n", + " 2.6364415 ]\n", + " [ 3.2536 1.1792442 -0.5618002 ... 2.101127 1.904225\n", + " 3.3839993 ]\n", + " [ 1.9118482 1.0651072 0.5409893 ... 2.6783593 1.6871439\n", + " 4.1078367 ]]]\n", + "\n", + "\n", + " [[[-4.412424 -1.7111907 -1.7722387 ... -4.3383503 -6.2393785\n", + " -6.139402 ]\n", + " [-2.260428 -1.0250616 -2.0550888 ... -5.353946 -4.29947\n", + " -6.158736 ]\n", + " [-1.4927872 0.7552787 -0.0702923 ... -4.485656 -4.0794134\n", + " -5.416684 ]\n", + " ...\n", + " [ 2.9100134 4.156195 4.357041 ... -3.569804 -1.8634341\n", + " -0.8772557 ]\n", + " [ 1.6895763 3.4314504 4.1192107 ... -1.380024 -2.3234155\n", + " -3.6650617 ]\n", + " [ 2.4190075 1.007498 3.1173465 ... -0.96318084 -3.6175003\n", + " -2.5240796 ]]]\n", + "\n", + "\n", + " ...\n", + "\n", + "\n", + " [[[-0.6865506 -0.60106415 -1.5555015 ... 2.0853553 1.900961\n", + " 2.101063 ]\n", + " [-0.31686288 -1.4362946 -1.4929098 ... 0.15085456 1.4540495\n", + " 1.4128599 ]\n", + " [-0.57852304 -0.8204216 -2.3264258 ... 1.4970423 0.54599845\n", + " 1.6222539 ]\n", + " ...\n", + " [ 0.32624918 0.96004546 -0.7476514 ... 2.2786083 2.1000178\n", + " 2.7494807 ]\n", + " [-1.6967826 -0.78979015 -1.8424999 ... 1.0620685 2.0544293\n", + " 2.2483966 ]\n", + " [ 0.8192332 2.601636 -2.6636481 ... 0.26625186 1.7610842\n", + " 1.7467536 ]]]\n", + "\n", + "\n", + " [[[ 0.9140297 0.42424175 1.4352363 ... -2.3022954 -3.001058\n", + " -2.6987422 ]\n", + " [ 0.4491998 -0.10698095 1.5089144 ... -3.2831016 -3.6055021\n", + " -3.6595795 ]\n", + " [ 2.6818252 -1.5750014 -0.34812498 ... -4.4137015 -4.250422\n", + " -3.481941 ]\n", + " ...\n", + " [ 1.4232106 2.9689102 3.9547806 ... -0.481165 0.28190404\n", + " -1.2167063 ]\n", + " [ 2.2297084 4.8198485 4.2857304 ... 0.57483846 1.4093391\n", + " 0.0715822 ]\n", + " [ 1.679745 4.768068 5.416195 ... 0.17254728 0.4623217\n", + " 1.4772662 ]]]\n", + "\n", + "\n", + " [[[-2.0860114 -2.9508173 -1.4945896 ... -4.067145 -2.5652342\n", + " -3.5771027 ]\n", + " [-2.697845 -1.9273603 -2.3885014 ... -2.196533 -2.8573706\n", + " -2.0113711 ]\n", + " [-2.413383 -2.7204053 -1.0502659 ... -3.001385 -3.36447\n", + " -4.3225455 ]\n", + " ...\n", + " [ 1.2754489 0.9560999 1.5239805 ... 
-0.0105865 -1.00876\n", + " 2.6247358 ]\n", + " [ 1.1965859 1.0378222 1.1025598 ... -0.5394704 0.49838027\n", + " -0.9618193 ]\n", + " [ 1.1361816 1.3232857 0.687318 ... -0.23925456 -0.43679112\n", + " -0.79297894]]]]\n", + "param grad: conv.conv_in.conv.bias: shape: [32] stop_grad: False grad: [ 5.9604645e-07 -3.9339066e-06 -1.0728836e-06 -1.6689301e-06\n", + " 1.1920929e-06 -2.5033951e-06 -2.3841858e-07 4.7683716e-07\n", + " 4.2915344e-06 -1.9073486e-06 -1.9073486e-06 3.0994415e-06\n", + " -2.6822090e-06 3.3378601e-06 -4.2915344e-06 5.2452087e-06\n", + " 3.8146973e-06 2.3841858e-07 7.1525574e-07 -3.6954880e-06\n", + " 2.0563602e-06 -2.6226044e-06 3.0994415e-06 -3.5762787e-07\n", + " -4.7683716e-06 1.2218952e-06 3.3378601e-06 -2.5629997e-06\n", + " 2.3841858e-07 -1.7881393e-06 4.7683716e-07 -2.7418137e-06]\n", + "param grad: conv.conv_in.bn.weight: shape: [32] stop_grad: False grad: [ 2.363316 3.286464 1.9607866 -1.6367784 -1.6325372 -1.7729434\n", + " -0.9261875 2.0950415 0.1155543 -0.8857083 0.70079553 0.33920464\n", + " 2.6953902 -0.64524114 0.8845749 -1.2271115 0.6578167 -2.939814\n", + " 5.5728893 -1.0917969 0.01470797 1.395206 4.8009634 -0.744532\n", + " 0.944651 -1.092311 1.4877632 -3.042566 0.51686054 -5.4768667\n", + " -5.628145 -1.0894046 ]\n", + "param grad: conv.conv_in.bn.bias: shape: [32] stop_grad: False grad: [ 1.5193373 1.8838218 3.7722278 0.28052303 0.5386534 -0.44620085\n", + " -1.6977876 3.115642 0.03312349 -2.9121587 3.8925257 0.2288351\n", + " -2.273387 -1.3597974 4.3708124 -0.23374033 0.116272 -0.7064927\n", + " 6.5267463 -1.5318865 1.0288429 0.7928574 -0.24655592 -2.1116853\n", + " 2.922772 -3.3462617 1.7016437 -3.5471547 0.29777628 -3.2820854\n", + " -4.116946 -0.9909375 ]\n", + "param grad: conv.conv_in.bn._mean: shape: [32] stop_grad: True grad: None\n", + "param grad: conv.conv_in.bn._variance: shape: [32] stop_grad: True grad: None\n", + "param grad: conv.conv_stack.0.conv.weight: shape: [32, 32, 21, 11] stop_grad: False grad: [[[[ 6.20494843e-01 5.95983505e-01 -1.48909020e+00 ... -6.86620831e-01\n", + " 6.71104014e-01 -1.95339048e+00]\n", + " [-3.91837955e-03 1.27062631e+00 -1.63248098e+00 ... 1.07290137e+00\n", + " -9.42245364e-01 -3.34277248e+00]\n", + " [ 2.41821265e+00 2.36212373e-01 -1.84433365e+00 ... 1.23182368e+00\n", + " 1.36039746e+00 -2.94621849e+00]\n", + " ...\n", + " [ 1.55153418e+00 7.25861669e-01 2.08785534e+00 ... -6.40172660e-01\n", + " -3.23889256e-02 -2.30832791e+00]\n", + " [ 3.69824195e+00 1.27163112e-01 4.09263194e-01 ... -8.60729575e-01\n", + " -3.51897454e+00 -2.10093403e+00]\n", + " [-4.94779050e-01 -3.74262631e-01 -1.19801068e+00 ... -2.05930543e+00\n", + " -7.38576293e-01 -9.44581270e-01]]\n", + "\n", + " [[-2.04341412e+00 -3.70606273e-01 -1.40429378e+00 ... -1.71711946e+00\n", + " -4.09437418e-01 -1.74107194e+00]\n", + " [-8.72247815e-01 -1.06301677e+00 -9.19306517e-01 ... -2.98976970e+00\n", + " -3.03250861e+00 -2.37099743e+00]\n", + " [-5.00457406e-01 -1.11882675e+00 -5.91526508e-01 ... 4.23921436e-01\n", + " -2.08650708e+00 -1.82109618e+00]\n", + " ...\n", + " [ 2.07773042e+00 1.40735030e-01 -2.60543615e-01 ... -1.55956164e-01\n", + " -1.31862307e+00 -2.07174897e+00]\n", + " [ 7.95007765e-01 1.14988625e-01 -1.43308258e+00 ... 8.29253554e-01\n", + " -9.57888126e-01 -3.82121086e-01]\n", + " [ 8.34397674e-02 1.38636863e+00 -1.21593380e+00 ... -2.65783578e-01\n", + " 1.78124309e-02 -3.40287232e+00]]\n", + "\n", + " [[ 6.27344131e-01 5.71699142e-02 -3.58010936e+00 ... 
-4.53077674e-01\n", + " 1.65331578e+00 2.58466601e-02]\n", + " [ 2.66681361e+00 2.02069378e+00 -1.52052927e+00 ... 2.94914508e+00\n", + " 1.94632411e+00 -1.06698799e+00]\n", + " [ 1.57839453e+00 -1.03649735e-01 -4.22528505e+00 ... 2.28863955e+00\n", + " 4.27859402e+00 3.66381669e+00]\n", + " ...\n", + " [-2.44603205e+00 -2.09621000e+00 -2.57623529e+00 ... 9.00211930e-01\n", + " 4.30536079e+00 -2.49779320e+00]\n", + " [-2.52187514e+00 -3.36546659e+00 -1.26748765e+00 ... 8.11533451e-01\n", + " 2.55930424e-01 4.50821817e-02]\n", + " [-3.40082574e+00 -3.26924801e+00 -5.86932135e+00 ... -1.18203712e+00\n", + " 1.09565187e+00 -4.96661961e-01]]\n", + "\n", + " ...\n", + "\n", + " [[ 8.20469666e+00 6.96195841e+00 2.73753977e+00 ... 8.34498823e-01\n", + " 2.56748104e+00 1.67592216e+00]\n", + " [ 9.85801792e+00 8.81465149e+00 6.09280396e+00 ... 1.42389655e+00\n", + " 2.92086434e+00 2.08308399e-01]\n", + " [ 8.00702763e+00 7.97301006e+00 4.64527416e+00 ... 8.61916900e-01\n", + " 3.55370259e+00 4.75085378e-01]\n", + " ...\n", + " [ 5.61662769e+00 -4.72857296e-01 -1.04519971e-01 ... -4.03000236e-01\n", + " -1.66419971e+00 -1.70375630e-01]\n", + " [ 4.52409792e+00 -3.70670676e-01 4.54190969e-02 ... -8.20453286e-01\n", + " 9.49141383e-02 8.88008535e-01]\n", + " [ 3.27219462e+00 8.93201411e-01 1.94810414e+00 ... -2.86915004e-02\n", + " 1.93200278e+00 8.19505215e-01]]\n", + "\n", + " [[ 5.84066296e+00 6.72855520e+00 5.21399307e+00 ... 4.55058670e+00\n", + " 3.19132543e+00 3.17435169e+00]\n", + " [ 6.04594421e+00 6.88997173e+00 5.00542831e+00 ... 2.23561144e+00\n", + " 2.76059532e+00 4.83479440e-01]\n", + " [ 5.36118126e+00 4.13896275e+00 3.68701124e+00 ... 3.64462805e+00\n", + " 2.80596399e+00 1.52781498e+00]\n", + " ...\n", + " [ 2.87856674e+00 5.84320784e-01 1.74297714e+00 ... 2.83938944e-01\n", + " -2.26546407e-01 -1.18434143e+00]\n", + " [ 2.08510804e+00 1.74915957e+00 1.58637917e+00 ... 6.41967297e-01\n", + " -1.31319761e-01 -3.85830402e-01]\n", + " [ 4.41666174e+00 2.58244562e+00 2.97712159e+00 ... 1.42317235e-01\n", + " 1.68037796e+00 -6.50003672e-01]]\n", + "\n", + " [[ 1.05511594e+00 6.74880028e-01 -7.64639139e-01 ... -2.15282440e-01\n", + " 2.07197094e+00 4.48752761e-01]\n", + " [ 2.12095881e+00 3.44118834e+00 1.61375272e+00 ... -1.18487728e+00\n", + " 1.88659012e+00 1.48252523e+00]\n", + " [ 8.33427787e-01 4.35035896e+00 -3.59877385e-02 ... 8.70242774e-01\n", + " 3.75945044e+00 -3.09408635e-01]\n", + " ...\n", + " [ 5.08510351e+00 4.73114061e+00 1.97346115e+00 ... -2.25924397e+00\n", + " -1.26373076e+00 -1.37826729e+00]\n", + " [ 6.17275095e+00 4.16016817e+00 3.15675950e+00 ... -2.02416754e+00\n", + " 1.50002241e-02 1.84633851e+00]\n", + " [ 7.32995272e+00 5.34601831e+00 4.58857203e+00 ... -1.88874304e+00\n", + " 1.53240371e+00 7.47349262e-02]]]\n", + "\n", + "\n", + " [[[-1.80918843e-01 -2.52616453e+00 -2.78145695e+00 ... 1.44283652e+00\n", + " -1.08945215e+00 4.19084758e-01]\n", + " [-9.66833949e-01 -2.41106153e+00 -3.48886085e+00 ... -1.87193304e-01\n", + " 8.21905077e-01 1.89097953e+00]\n", + " [-1.59118319e+00 -2.56997013e+00 -3.10426521e+00 ... 2.05900550e+00\n", + " -2.78253704e-01 6.96343541e-01]\n", + " ...\n", + " [ 6.66302443e-02 -2.00887346e+00 -3.17550874e+00 ... 7.97579706e-01\n", + " -9.71581042e-02 1.71877682e+00]\n", + " [-8.01679730e-01 -2.02678037e+00 -3.21915555e+00 ... 8.35528374e-01\n", + " -1.15296638e+00 4.35728967e-01]\n", + " [ 1.45292446e-01 -2.15479851e+00 -1.51839817e+00 ... 
-3.07936192e-01\n", + " -5.39051890e-01 1.13107657e+00]]\n", + "\n", + " [[-2.43341160e+00 -3.35346818e+00 -9.87014294e-01 ... 1.34049034e+00\n", + " 2.95773447e-02 1.27177119e+00]\n", + " [-2.61602497e+00 -9.76761580e-01 -2.52060473e-01 ... -1.38134825e+00\n", + " 3.85564029e-01 4.57195908e-01]\n", + " [-2.23676014e+00 -4.00404739e+00 -2.23409963e+00 ... -1.41846514e+00\n", + " -6.58698231e-02 -3.61778140e-01]\n", + " ...\n", + " [-1.13604403e+00 -6.03917837e-02 -4.95491922e-01 ... 2.14673686e+00\n", + " 1.21484184e+00 2.22764325e+00]\n", + " [-1.05162430e+00 -1.59828448e+00 3.15489501e-01 ... 2.28046751e+00\n", + " 2.39702511e+00 2.43942714e+00]\n", + " [-1.27370405e+00 -2.05736399e-01 -1.12124372e+00 ... 2.21597219e+00\n", + " 2.50086927e+00 1.91134131e+00]]\n", + "\n", + " [[-4.53170598e-01 -1.59644139e+00 -3.63470483e+00 ... -4.35066032e+00\n", + " -3.79540777e+00 -1.09796596e+00]\n", + " [-2.21036464e-01 -2.53353834e+00 -1.28269875e+00 ... -3.38615727e+00\n", + " -2.59143281e+00 7.74220943e-01]\n", + " [-6.89323783e-01 -1.44375205e+00 6.66438341e-02 ... -1.30736077e+00\n", + " -1.23293114e+00 1.58148706e+00]\n", + " ...\n", + " [ 1.63751483e+00 -4.08427984e-01 -8.15176964e-01 ... 3.70807743e+00\n", + " 2.04232907e+00 1.97716308e+00]\n", + " [ 2.13261342e+00 1.85947633e+00 -8.06532025e-01 ... 1.98311245e+00\n", + " 2.27003932e+00 -1.11734614e-01]\n", + " [ 1.28702402e+00 3.98628891e-01 -1.63712263e+00 ... 8.00528765e-01\n", + " 5.78273535e-01 -2.59924948e-01]]\n", + "\n", + " ...\n", + "\n", + " [[ 3.96233416e+00 4.66794682e+00 1.39437711e+00 ... 7.52061129e-01\n", + " -1.53534544e+00 -6.67162359e-01]\n", + " [ 2.33841681e+00 3.35811281e+00 9.80114818e-01 ... 1.48806703e+00\n", + " 2.68609226e-01 -1.35124445e+00]\n", + " [ 2.08177710e+00 4.28519583e+00 1.52450514e+00 ... 7.45321214e-01\n", + " -5.04359961e-01 -1.81241560e+00]\n", + " ...\n", + " [ 2.95398951e-01 4.30877179e-01 -2.03731894e+00 ... -4.20221925e-01\n", + " 3.29260826e-01 5.83679557e-01]\n", + " [ 1.30742240e+00 -6.32183790e-01 -3.13741422e+00 ... 9.63868052e-02\n", + " 2.91730791e-01 1.33400351e-01]\n", + " [ 5.43292165e-01 -2.83665359e-01 -1.88138187e+00 ... 2.15468198e-01\n", + " 4.90157723e-01 2.40562439e+00]]\n", + "\n", + " [[ 1.57632053e+00 6.27885723e+00 2.87853765e+00 ... 3.07016110e+00\n", + " 1.91490650e+00 1.76274943e+00]\n", + " [ 2.57776356e+00 4.07256317e+00 2.52231169e+00 ... 4.09494352e+00\n", + " 2.53548074e+00 2.44395185e+00]\n", + " [ 2.43037057e+00 4.35728836e+00 1.96233964e+00 ... 2.26702976e+00\n", + " 2.94634581e+00 2.21452284e+00]\n", + " ...\n", + " [-2.72509992e-01 -8.41220498e-01 -1.89133918e+00 ... -1.80079627e+00\n", + " -2.00367713e+00 -7.09145784e-01]\n", + " [ 8.21575999e-01 -1.13323164e+00 -2.62418866e+00 ... -2.38889670e+00\n", + " -7.83945560e-01 -1.01922750e-01]\n", + " [-1.14730227e+00 -1.42182577e+00 -2.00993991e+00 ... -2.11025667e+00\n", + " 1.60286129e-02 -7.26446986e-01]]\n", + "\n", + " [[ 4.20389509e+00 3.75917768e+00 4.97653627e+00 ... 1.23642838e+00\n", + " 8.52760911e-01 1.27920091e-01]\n", + " [ 5.29409122e+00 5.29002380e+00 3.96404648e+00 ... 1.91227329e+00\n", + " 3.97556186e-01 1.69182217e+00]\n", + " [ 4.60112572e+00 4.12772799e+00 2.10280085e+00 ... 3.24303842e+00\n", + " -1.07720590e+00 -3.81854475e-01]\n", + " ...\n", + " [ 1.81884170e-02 -3.11472058e+00 -8.23525012e-01 ... -2.40161085e+00\n", + " -4.48192549e+00 -6.14600539e-01]\n", + " [ 1.16305006e+00 -1.15409636e+00 -3.48765063e+00 ... 
-1.97504926e+00\n", + " -4.44984436e+00 -2.28429958e-01]\n", + " [ 1.29197860e+00 6.17720246e-01 -5.87171853e-01 ... -1.35258228e-01\n", + " -1.29259872e+00 1.30360842e-01]]]\n", + "\n", + "\n", + " [[[-1.26687372e+00 -2.33633637e+00 -1.49625254e+00 ... 2.52396107e+00\n", + " -6.68072224e-01 -1.13282454e+00]\n", + " [-1.34229445e+00 -2.87080932e+00 -2.57388353e+00 ... -8.75385761e-01\n", + " -1.00205469e+00 -3.58956242e+00]\n", + " [-9.49853599e-01 -5.78684711e+00 -3.52962446e+00 ... 8.88233304e-01\n", + " 2.25133196e-01 -1.02802217e+00]\n", + " ...\n", + " [-7.38113701e-01 -3.47510982e+00 -3.23011065e+00 ... -1.25624001e+00\n", + " -1.63268471e+00 6.00247443e-01]\n", + " [-2.29733467e+00 -5.72547615e-01 -1.98301303e+00 ... -1.90137398e+00\n", + " -1.47013855e+00 -1.45779204e+00]\n", + " [-2.24628520e+00 -3.36337948e+00 -3.91878939e+00 ... -1.53652275e+00\n", + " -1.36285520e+00 -1.68160331e+00]]\n", + "\n", + " [[-8.11348319e-01 -7.17824280e-01 -1.02243233e+00 ... -2.69050407e+00\n", + " -2.32403350e+00 -4.25943947e+00]\n", + " [-2.35056520e+00 -2.35941172e+00 -1.24398732e+00 ... -2.08313870e+00\n", + " -1.16508257e+00 -1.30353463e+00]\n", + " [-2.25146723e+00 -1.94972813e+00 -1.13295293e+00 ... -2.61496377e+00\n", + " -1.91106403e+00 -1.07801402e+00]\n", + " ...\n", + " [-2.67012739e+00 -3.20916414e+00 -2.41768575e+00 ... 2.65138328e-01\n", + " -5.27612507e-01 1.44604075e+00]\n", + " [-3.54237866e+00 -3.62832785e+00 -2.40270257e+00 ... -9.76106226e-02\n", + " 4.67946082e-01 -7.24248111e-01]\n", + " [-2.49844384e+00 -3.42463255e+00 -2.99040008e+00 ... 4.28889185e-01\n", + " -7.51657963e-01 -1.00530767e+00]]\n", + "\n", + " [[-8.42589438e-02 1.42022014e-01 -8.51281703e-01 ... 4.21745628e-01\n", + " -2.35717297e-02 -1.71374834e+00]\n", + " [-1.05496287e+00 3.82416457e-01 -4.40595537e-01 ... 1.03381336e-01\n", + " -1.41204190e+00 -7.58325040e-01]\n", + " [-2.28930283e+00 -2.03857040e+00 -9.16261196e-01 ... -3.94939929e-01\n", + " -1.07798588e+00 -1.48433352e+00]\n", + " ...\n", + " [-3.11473966e-01 -1.40877593e+00 -2.42908645e+00 ... 7.88682699e-01\n", + " 1.24199319e+00 1.89949930e-01]\n", + " [ 5.44084549e-01 -1.02425671e+00 -1.53991556e+00 ... -4.36764538e-01\n", + " -5.78772545e-01 2.62665659e-01]\n", + " [ 1.26812792e+00 -9.89493608e-01 -1.47972977e+00 ... 2.21440494e-02\n", + " 2.79776216e-01 7.63269484e-01]]\n", + "\n", + " ...\n", + "\n", + " [[ 6.02095068e-01 5.93243122e-01 -1.06838238e+00 ... 3.56546330e+00\n", + " 1.16390383e+00 -1.47593319e-01]\n", + " [ 1.80458140e+00 1.68401957e+00 4.17516947e-01 ... 3.33444500e+00\n", + " 1.89411759e+00 1.03220642e-01]\n", + " [ 2.74264169e+00 2.92038846e+00 1.00775683e+00 ... 3.53285050e+00\n", + " 2.07282662e+00 -2.56800652e-01]\n", + " ...\n", + " [ 4.88933468e+00 3.72433925e+00 3.58677816e+00 ... 1.98363388e+00\n", + " 1.80851030e+00 8.32634747e-01]\n", + " [ 4.01546288e+00 4.78934765e+00 2.94778132e+00 ... 2.99637699e+00\n", + " 1.30439472e+00 3.61029744e-01]\n", + " [ 3.13628030e+00 2.01894832e+00 2.82585931e+00 ... 2.54264188e+00\n", + " -9.16651785e-02 9.93353873e-02]]\n", + "\n", + " [[ 2.35585642e+00 8.42678428e-01 1.57331872e+00 ... 3.65935063e+00\n", + " 3.94066262e+00 4.89832020e+00]\n", + " [ 1.85791731e+00 1.34373701e+00 1.30812299e+00 ... 2.71434736e+00\n", + " 3.22004294e+00 2.99872303e+00]\n", + " [ 1.67675853e+00 -4.05569375e-02 1.85539150e+00 ... 3.73934364e+00\n", + " 2.98195982e+00 3.37315011e+00]\n", + " ...\n", + " [ 2.14539170e+00 2.86586595e+00 2.20222116e+00 ... 
1.20492995e+00\n", + " 2.13971066e+00 1.94932449e+00]\n", + " [ 4.68422651e+00 3.80044746e+00 4.23209000e+00 ... 2.40658951e+00\n", + " 2.29117441e+00 2.52368808e+00]\n", + " [ 3.10694575e+00 2.49402595e+00 4.53786707e+00 ... 9.08902645e-01\n", + " 1.86903965e+00 2.27776885e+00]]\n", + "\n", + " [[ 1.45200038e+00 5.17961740e-01 -1.58403587e+00 ... 5.07019472e+00\n", + " 7.87163258e-01 1.20610237e+00]\n", + " [ 3.39321136e+00 2.21043849e+00 -6.31202877e-01 ... 4.97822762e+00\n", + " 9.66498017e-01 1.18883348e+00]\n", + " [ 1.20627856e+00 1.82759428e+00 5.91053367e-01 ... 4.14318657e+00\n", + " 5.25399208e-01 -1.16850233e+00]\n", + " ...\n", + " [ 1.05183899e+00 5.80030501e-01 1.89724147e+00 ... 2.54626465e+00\n", + " -1.49128008e+00 -1.85064209e+00]\n", + " [ 1.50983357e+00 2.85973406e+00 2.61224055e+00 ... 4.83481932e+00\n", + " 9.67048705e-02 -4.37043965e-01]\n", + " [ 2.57720876e+00 2.09961963e+00 4.11754288e-02 ... 3.80421424e+00\n", + " -7.83308804e-01 -1.64871216e+00]]]\n", + "\n", + "\n", + " ...\n", + "\n", + "\n", + " [[[-1.16345096e+00 -2.53971386e+00 -8.99101734e-01 ... -4.35583591e-01\n", + " -1.29671764e+00 -1.61429560e+00]\n", + " [ 3.72841507e-01 3.45808208e-01 -1.82167351e+00 ... -2.14515448e+00\n", + " -1.26383066e+00 -2.27464601e-01]\n", + " [ 1.58568513e+00 2.58181524e+00 1.86554670e+00 ... -1.10401320e+00\n", + " -3.68550658e-01 -2.58849680e-01]\n", + " ...\n", + " [-9.15827155e-01 -1.25424683e+00 -4.04716206e+00 ... 2.13138080e+00\n", + " 2.67662477e+00 2.31014514e+00]\n", + " [-3.19453120e-01 -6.71132684e-01 -1.51378751e+00 ... 1.86080432e+00\n", + " 2.77418542e+00 1.22875953e+00]\n", + " [-1.20453942e+00 -3.93669218e-01 -1.51751983e+00 ... 1.17620552e+00\n", + " 1.95602298e+00 7.64306366e-01]]\n", + "\n", + " [[-8.73186827e-01 -2.12537169e+00 -1.91664994e+00 ... -2.90821463e-01\n", + " 1.90896463e+00 8.02283168e-01]\n", + " [-1.06389821e+00 -2.15300727e+00 -1.82113051e+00 ... -4.34280694e-01\n", + " 1.53455496e+00 1.94702053e+00]\n", + " [-2.08403468e+00 -4.72900331e-01 -1.10610819e+00 ... -8.79420400e-01\n", + " 7.79394627e-01 2.02670670e+00]\n", + " ...\n", + " [-4.28208113e-01 -7.90894389e-01 -1.06713009e+00 ... 1.12579381e+00\n", + " 9.61961091e-01 1.40342009e+00]\n", + " [ 4.40416574e-01 -1.65901780e-02 -1.05338669e+00 ... 1.40698349e+00\n", + " 9.43485856e-01 2.34856772e+00]\n", + " [-1.20572495e+00 -2.03134632e+00 4.88817632e-01 ... 2.20770907e+00\n", + " 1.38143206e+00 2.00714707e+00]]\n", + "\n", + " [[ 9.00486887e-01 -9.50459957e-01 -1.42935121e+00 ... -1.30648065e+00\n", + " -2.52133775e+00 -8.87715697e-01]\n", + " [ 3.73431134e+00 1.69571114e+00 5.99429727e-01 ... 6.64332986e-01\n", + " -6.10453069e-01 2.06534386e+00]\n", + " [ 1.59800696e+00 -4.59622175e-01 -6.73136234e-01 ... 2.18770742e-01\n", + " -1.12928271e+00 4.87097502e-02]\n", + " ...\n", + " [ 1.92336845e+00 1.37130380e-01 -3.51048648e-01 ... 5.41638851e-01\n", + " 1.06069386e+00 1.36404145e+00]\n", + " [ 1.29641414e+00 -2.79530913e-01 -2.63607264e-01 ... -8.62445176e-01\n", + " 1.48393130e+00 2.69196725e+00]\n", + " [ 1.14442182e+00 -1.24098969e+00 3.70959163e-01 ... -1.12241995e+00\n", + " 3.67927134e-01 2.55976987e+00]]\n", + "\n", + " ...\n", + "\n", + " [[ 5.32017851e+00 3.64207411e+00 3.84571218e+00 ... 3.60754800e+00\n", + " 2.57500267e+00 -1.38083458e-01]\n", + " [ 5.69058084e+00 3.93056583e+00 2.93337941e+00 ... 3.17091584e+00\n", + " 2.34770632e+00 6.48133337e-01]\n", + " [ 5.98239613e+00 6.16548634e+00 3.04750896e+00 ... 
5.51510525e+00\n", + " 4.34810448e+00 1.31588542e+00]\n", + " ...\n", + " [ 5.09930992e+00 3.32360983e+00 2.29228449e+00 ... 3.45123887e-01\n", + " 1.06280947e+00 -5.93325794e-02]\n", + " [ 4.19760656e+00 3.97779059e+00 1.66905916e+00 ... 3.68937254e-01\n", + " 8.06131065e-02 8.08142900e-01]\n", + " [ 4.52498960e+00 3.45109749e+00 1.01074433e+00 ... -2.54036248e-01\n", + " 3.13675582e-01 2.13851762e+00]]\n", + "\n", + " [[ 6.93927193e+00 6.05758238e+00 4.60648441e+00 ... 4.32221603e+00\n", + " 3.17874146e+00 1.47012353e+00]\n", + " [ 7.88523865e+00 6.62228966e+00 4.77496338e+00 ... 4.45868683e+00\n", + " 2.73698759e+00 2.17057824e+00]\n", + " [ 7.12061214e+00 6.01714134e+00 4.52996492e+00 ... 3.97184372e+00\n", + " 3.43153954e+00 1.21802723e+00]\n", + " ...\n", + " [ 2.85720730e+00 1.89639473e+00 1.96340394e+00 ... 1.89643729e+00\n", + " 1.64856291e+00 1.15853786e+00]\n", + " [ 3.88248491e+00 2.16386199e+00 1.53069091e+00 ... 2.71704245e+00\n", + " 2.24890351e+00 2.22156644e+00]\n", + " [ 5.27136230e+00 1.68400204e+00 2.09500480e+00 ... 2.75956345e+00\n", + " 3.71970820e+00 1.69852686e+00]]\n", + "\n", + " [[ 2.55598164e+00 1.64588141e+00 6.70431674e-01 ... 3.24091220e+00\n", + " 1.48759770e+00 -1.72001183e+00]\n", + " [ 4.33942318e+00 8.40826690e-01 -7.40000725e-01 ... 7.24577069e-01\n", + " 1.74327165e-01 -1.83029580e+00]\n", + " [ 4.39864540e+00 2.28395438e+00 -1.90353513e-01 ... 5.58019161e+00\n", + " 1.05627227e+00 -8.02519619e-01]\n", + " ...\n", + " [ 1.97654784e+00 3.26888156e+00 1.52879453e+00 ... 3.15013933e+00\n", + " 4.66731453e+00 4.98701715e+00]\n", + " [ 1.40016854e+00 3.45761251e+00 3.68359756e+00 ... 1.14207900e+00\n", + " 3.32219076e+00 3.83035636e+00]\n", + " [ 1.99269783e+00 2.15428829e+00 3.35396528e-01 ... 2.45916694e-01\n", + " 2.13785577e+00 4.33214951e+00]]]\n", + "\n", + "\n", + " [[[ 1.35320330e+00 5.05850911e-02 1.04915988e+00 ... 1.82023585e-01\n", + " 2.72914767e-01 3.92112255e-01]\n", + " [ 1.04646444e+00 7.60913491e-01 1.93323612e+00 ... 1.19493449e+00\n", + " -1.44200325e-01 4.07531261e-02]\n", + " [-9.88207340e-01 -1.46165287e+00 1.05884135e-01 ... -3.23057353e-01\n", + " -2.28934169e+00 -7.38609374e-01]\n", + " ...\n", + " [ 1.01198792e+00 2.34331083e+00 1.04566610e+00 ... 1.29697472e-01\n", + " -1.23878837e+00 2.21006930e-01]\n", + " [-3.75360101e-01 1.53673506e+00 -1.32206869e+00 ... -2.55255580e-01\n", + " -6.22699618e-01 -1.73162484e+00]\n", + " [ 4.34735864e-01 5.08327007e-01 -3.49233925e-01 ... -1.04749084e+00\n", + " -1.15777385e+00 -1.13671994e+00]]\n", + "\n", + " [[ 1.67839336e+00 -1.80224836e-01 1.02194118e+00 ... 8.44027162e-01\n", + " 8.81283879e-02 -1.37762165e+00]\n", + " [ 8.39694083e-01 1.32322550e+00 4.02442753e-01 ... -4.21785116e-01\n", + " -9.98012185e-01 -1.11348581e+00]\n", + " [ 7.64424682e-01 8.58965695e-01 2.94626594e-01 ... -6.65519595e-01\n", + " -3.65677416e-01 -2.25250268e+00]\n", + " ...\n", + " [-1.10193872e+00 1.18070498e-01 1.04604781e-01 ... -1.44486964e+00\n", + " -2.52748466e+00 -2.16131711e+00]\n", + " [-1.06079710e+00 -1.48379254e+00 3.80138367e-01 ... -1.62288392e+00\n", + " -2.44736362e+00 -8.78590107e-01]\n", + " [ 3.44401300e-02 -2.60935068e+00 -2.35597759e-01 ... -2.41114974e+00\n", + " -2.45255780e+00 -1.82384634e+00]]\n", + "\n", + " [[ 1.37670958e+00 1.58661580e+00 -2.85664916e-01 ... 1.49081087e+00\n", + " 4.13422853e-01 1.12761199e+00]\n", + " [ 1.54148173e+00 6.22704089e-01 1.41886568e+00 ... 1.59678531e+00\n", + " -8.72656107e-01 1.52415514e-01]\n", + " [ 3.30207205e+00 2.89925170e+00 1.91855145e+00 ... 
3.18863559e+00\n", + " 1.87347198e+00 9.48901057e-01]\n", + " ...\n", + " [-1.53920484e+00 1.77375078e-02 -1.02018684e-01 ... 1.94011092e+00\n", + " -6.83587790e-01 1.49154460e+00]\n", + " [-2.27719522e+00 1.02481163e+00 -2.11300224e-01 ... -8.18020821e-01\n", + " 1.54248989e+00 -1.46732473e+00]\n", + " [-4.50206220e-01 3.62383485e+00 1.07175660e+00 ... 4.25961137e-01\n", + " 1.12405360e-01 -6.87821358e-02]]\n", + "\n", + " ...\n", + "\n", + " [[-3.40477467e-01 -2.99311423e+00 -2.12096786e+00 ... 2.27393007e+00\n", + " 4.03424358e+00 3.73335361e+00]\n", + " [-6.99971199e-01 -2.97719741e+00 -2.72910309e+00 ... 1.50101089e+00\n", + " 2.29408574e+00 3.14105940e+00]\n", + " [-1.41648722e+00 -1.86292887e+00 -1.84006739e+00 ... 2.78402638e+00\n", + " 3.91481900e+00 5.32456112e+00]\n", + " ...\n", + " [ 5.97958088e-01 1.50512588e+00 6.23718500e-01 ... 2.83813477e+00\n", + " 3.87909842e+00 3.33359623e+00]\n", + " [ 1.65542316e+00 3.56163192e+00 4.01527691e+00 ... 3.38367462e+00\n", + " 1.55827272e+00 2.50741863e+00]\n", + " [ 2.82036042e+00 2.53322673e+00 4.38798475e+00 ... 4.64642382e+00\n", + " 3.28739667e+00 3.02895570e+00]]\n", + "\n", + " [[-3.47941303e+00 -3.49006844e+00 -2.25583363e+00 ... 1.45181656e-01\n", + " 1.52944064e+00 2.08810711e+00]\n", + " [-2.27786446e+00 -4.59218550e+00 -2.74722624e+00 ... -1.73136210e+00\n", + " 7.46028006e-01 1.74789345e+00]\n", + " [-3.35524082e+00 -4.58244705e+00 -2.40820456e+00 ... -5.04051924e-01\n", + " 1.49640536e+00 2.16613841e+00]\n", + " ...\n", + " [ 5.26107132e-01 2.05329061e+00 2.84252572e+00 ... 1.33222675e+00\n", + " 3.87935114e+00 3.69385266e+00]\n", + " [ 4.38092083e-01 2.15028906e+00 3.13363624e+00 ... 3.36048746e+00\n", + " 5.36551809e+00 2.94915986e+00]\n", + " [ 2.75497317e+00 3.25929213e+00 2.33522987e+00 ... 1.69926262e+00\n", + " 3.93462896e+00 3.68200874e+00]]\n", + "\n", + " [[ 1.10951948e+00 5.31419516e-02 -1.58864903e+00 ... 5.24887085e+00\n", + " 1.60273385e+00 4.90113163e+00]\n", + " [-2.94517064e+00 -2.81092644e+00 -4.89631557e+00 ... 3.99868512e+00\n", + " 1.40544355e+00 2.84833241e+00]\n", + " [-3.51893663e-01 -3.53325534e+00 -2.21239805e+00 ... 4.26225853e+00\n", + " 6.87886119e-01 2.58609629e+00]\n", + " ...\n", + " [ 2.92248201e+00 5.40264511e+00 4.65721560e+00 ... 5.24537373e+00\n", + " 2.30406880e+00 1.29892707e+00]\n", + " [ 1.43473256e+00 4.61167526e+00 3.57578802e+00 ... 5.12181854e+00\n", + " 8.59923482e-01 1.38731599e+00]\n", + " [-6.50881350e-01 2.18233657e+00 2.74669623e+00 ... 4.86368895e+00\n", + " 1.44120216e+00 1.79993320e+00]]]\n", + "\n", + "\n", + " [[[ 1.64106202e+00 3.54410499e-01 -3.54172409e-01 ... 2.32646990e+00\n", + " 1.65043330e+00 3.45897645e-01]\n", + " [ 2.16236949e+00 1.28213906e+00 2.26082468e+00 ... 6.10507369e-01\n", + " 9.12241280e-01 1.27429694e-01]\n", + " [ 2.07962990e+00 7.03816175e-01 2.01272345e+00 ... -2.26959705e-01\n", + " 1.00041127e+00 5.87104559e-02]\n", + " ...\n", + " [-1.62972426e+00 -3.04028845e+00 -1.39124167e+00 ... 2.47561097e+00\n", + " 2.35047388e+00 1.61532843e+00]\n", + " [-1.97368932e+00 -5.44541061e-01 -5.92882216e-01 ... 1.39800012e+00\n", + " 2.32770801e+00 9.96662021e-01]\n", + " [-1.15636075e+00 -1.34654212e+00 -8.50648999e-01 ... 1.85655832e+00\n", + " 2.05776072e+00 5.34575820e-01]]\n", + "\n", + " [[-1.02104437e+00 3.08469892e-01 2.81789303e-01 ... -8.24654043e-01\n", + " -9.85817850e-01 -2.05517030e+00]\n", + " [ 9.50192690e-01 3.35105330e-01 5.31637192e-01 ... 
-1.42974198e-01\n", + " -1.79659498e+00 -1.58266973e+00]\n", + " [-2.51316994e-01 -1.28709340e+00 3.01498562e-01 ... -1.32253516e+00\n", + " -1.55507576e+00 -9.37123299e-01]\n", + " ...\n", + " [ 2.33016998e-01 2.92454743e+00 3.15420461e+00 ... 1.15574491e+00\n", + " 1.27850962e+00 1.35487700e+00]\n", + " [ 3.81013602e-01 1.44239831e+00 6.64825320e-01 ... -3.89374971e-01\n", + " 1.50716826e-01 1.33641326e+00]\n", + " [ 1.71373415e+00 1.67357373e+00 1.76596940e+00 ... 1.57941079e+00\n", + " 1.60940981e+00 1.78091609e+00]]\n", + "\n", + " [[-5.16522598e+00 -1.68099070e+00 -3.24440050e+00 ... -3.46229005e+00\n", + " -2.18273020e+00 -1.98621082e+00]\n", + " [-3.05743694e+00 9.15392339e-01 -1.93508530e+00 ... -1.82306373e+00\n", + " -2.12960863e+00 -3.45255351e+00]\n", + " [-4.32777822e-01 -1.00303245e+00 -1.61397791e+00 ... -2.08376765e+00\n", + " -3.72989595e-01 -1.36516929e+00]\n", + " ...\n", + " [-5.83641946e-01 4.14125490e+00 1.58227599e+00 ... 2.03144050e+00\n", + " 2.13982654e+00 -1.81909311e+00]\n", + " [-1.74230576e+00 2.39347410e+00 2.44080925e+00 ... 5.43732524e-01\n", + " 2.07899213e+00 -3.71748984e-01]\n", + " [ 3.80016506e-01 7.84988403e-01 1.20596504e+00 ... -2.32057095e+00\n", + " -2.81265080e-01 -3.69353056e+00]]\n", + "\n", + " ...\n", + "\n", + " [[-3.48024845e+00 -2.60937548e+00 -3.84952760e+00 ... 6.68736577e-01\n", + " -1.75104141e-02 -3.54720926e+00]\n", + " [-2.59637117e+00 -5.18190145e+00 -2.33887696e+00 ... 9.13373232e-02\n", + " -3.58282638e+00 -2.40778995e+00]\n", + " [-2.50912881e+00 -1.22113395e+00 -2.34372020e+00 ... 1.40071487e+00\n", + " -1.67449510e+00 -1.14655948e+00]\n", + " ...\n", + " [-5.75253534e+00 -6.67348385e+00 -5.05184650e+00 ... -2.73145151e+00\n", + " -1.48933101e+00 -1.36807609e+00]\n", + " [-3.29049587e+00 -3.73956156e+00 -2.85064268e+00 ... -3.92481357e-01\n", + " -8.00529659e-01 -8.39800835e-01]\n", + " [-4.30351114e+00 -4.21471930e+00 -2.41703367e+00 ... -1.27081513e+00\n", + " 1.67839837e+00 8.47821474e-01]]\n", + "\n", + " [[-5.27856112e-01 -1.09752083e+00 3.39107156e-01 ... 2.00062895e+00\n", + " 8.83528054e-01 2.57416844e-01]\n", + " [-1.58655810e+00 -3.36268663e-01 1.16161990e+00 ... 1.54868484e+00\n", + " 2.38878536e+00 1.84097290e+00]\n", + " [ 5.96052647e-01 2.15484858e-01 1.85280466e+00 ... 2.74587560e+00\n", + " 1.61432290e+00 1.13214278e+00]\n", + " ...\n", + " [-4.57659864e+00 -5.42679739e+00 -4.35204458e+00 ... -1.82452416e+00\n", + " -2.18670201e+00 -3.91811800e+00]\n", + " [-1.32477629e+00 -4.19110394e+00 -3.41308069e+00 ... 1.39622003e-01\n", + " -1.59393203e+00 -9.08105671e-01]\n", + " [-3.60161018e+00 -4.05932713e+00 -2.23674798e+00 ... 9.09647286e-01\n", + " 9.73127842e-01 1.19991803e+00]]\n", + "\n", + " [[ 2.04062796e+00 7.95603275e-01 -1.28833270e+00 ... 4.64749050e+00\n", + " 2.25974560e+00 1.02396965e+00]\n", + " [ 1.68882537e+00 2.63353348e+00 2.53597498e-02 ... 4.69063854e+00\n", + " -4.19382691e-01 2.91669458e-01]\n", + " [ 7.71395087e-01 1.20833695e+00 -2.58601785e-01 ... 1.21794045e+00\n", + " -1.51922226e-01 7.44265199e-01]\n", + " ...\n", + " [-6.66095781e+00 -4.81577682e+00 -5.39921665e+00 ... -2.20548606e+00\n", + " 5.72486281e-01 -4.35207397e-01]\n", + " [-7.51608658e+00 -6.67776871e+00 -3.73199415e+00 ... -1.70327055e+00\n", + " 1.01334639e-02 -3.20627165e+00]\n", + " [-5.73050356e+00 -2.74379373e+00 -3.70248461e+00 ... 
-1.09794116e+00\n", + " -1.73590891e-02 -1.80156028e+00]]]]\n", + "param grad: conv.conv_stack.0.conv.bias: shape: [32] stop_grad: False grad: [-1.4305115e-06 0.0000000e+00 -4.0531158e-06 -1.6689301e-06\n", + " 2.3841858e-07 -7.1525574e-07 1.1920929e-06 1.5497208e-06\n", + " -2.3841858e-07 1.6689301e-06 9.5367432e-07 9.5367432e-07\n", + " -2.6226044e-06 1.1920929e-06 1.3113022e-06 1.9669533e-06\n", + " -4.7683716e-07 1.1920929e-06 -1.6689301e-06 -1.5497208e-06\n", + " -2.2649765e-06 4.7683716e-07 2.3841858e-06 -3.5762787e-06\n", + " 2.3841858e-07 2.1457672e-06 -3.5762787e-07 8.3446503e-07\n", + " -3.5762787e-07 -7.1525574e-07 2.6524067e-06 -1.1920929e-06]\n", + "param grad: conv.conv_stack.0.bn.weight: shape: [32] stop_grad: False grad: [-3.7669735 1.5226867 1.759756 4.501629 -2.2077336 0.18411277\n", + " 1.3558264 -1.0269645 3.9628277 3.9300344 -2.80754 1.8462183\n", + " -0.03385968 2.1284049 0.46124816 -4.364863 0.78491163 0.25565645\n", + " -5.3538237 3.2606194 0.79100513 -1.4652673 2.769378 1.2283417\n", + " -4.7466464 -1.3404545 -6.9374166 0.710248 2.0944448 0.4334769\n", + " -0.24313992 0.31392363]\n", + "param grad: conv.conv_stack.0.bn.bias: shape: [32] stop_grad: False grad: [-0.6251638 2.833331 0.6993131 3.7106915 -2.262496 0.7390424\n", + " 0.5360477 -2.803875 2.1646228 2.117193 -1.9988279 1.5135905\n", + " -2.0181084 2.6450465 0.06302822 -3.0530102 1.4788482 0.5941844\n", + " -3.1690063 1.8753575 -0.0737313 -2.7806277 -0.04483938 0.16129279\n", + " -1.2960215 -0.38020235 -0.55218065 0.10754502 2.065371 -1.4703183\n", + " -0.40964937 -1.4454535 ]\n", + "param grad: conv.conv_stack.0.bn._mean: shape: [32] stop_grad: True grad: None\n", + "param grad: conv.conv_stack.0.bn._variance: shape: [32] stop_grad: True grad: None\n", + "param grad: rnn.rnn_stacks.0.fw_fc.weight: shape: [1312, 1024] stop_grad: False grad: [[-0.46178514 0.1095643 0.06441769 ... 0.42020613 -0.34181893\n", + " -0.0658682 ]\n", + " [-0.03619978 0.21653323 0.01727325 ... 0.05731536 -0.37822944\n", + " -0.05464617]\n", + " [-0.32397318 0.04158126 -0.08091418 ... 0.0928297 -0.06518176\n", + " -0.40110156]\n", + " ...\n", + " [-0.2702023 0.05126935 0.11825457 ... 0.0069707 -0.36951366\n", + " 0.37071258]\n", + " [-0.11326203 0.19305304 -0.133317 ... -0.13030824 -0.09068564\n", + " 0.32735693]\n", + " [-0.04543798 0.09902512 -0.10745425 ... -0.06685166 -0.3055201\n", + " 0.0752247 ]]\n", + "param grad: rnn.rnn_stacks.0.fw_bn.weight: shape: [1024] stop_grad: False grad: [-0.07338604 0.64991236 0.5465856 ... 0.507725 0.14061031\n", + " 0.3020359 ]\n", + "param grad: rnn.rnn_stacks.0.fw_bn.bias: shape: [1024] stop_grad: False grad: [-0.41395143 -0.28493872 0.36796764 ... 0.2387953 0.06732331\n", + " 0.16263628]\n", + "param grad: rnn.rnn_stacks.0.fw_bn._mean: shape: [1024] stop_grad: True grad: None\n", + "param grad: rnn.rnn_stacks.0.fw_bn._variance: shape: [1024] stop_grad: True grad: None\n", + "param grad: rnn.rnn_stacks.0.fw_cell.weight_hh: shape: [1024, 1024] stop_grad: False grad: [[-0.09370177 -0.12264141 -0.08237482 ... -0.50241685 -0.149155\n", + " -0.25661892]\n", + " [-0.37426725 0.44987115 0.10685667 ... -0.65946174 -0.4499248\n", + " -0.17545304]\n", + " [-0.03753807 0.33422717 0.12750985 ... 0.05405155 -0.17648363\n", + " 0.05315325]\n", + " ...\n", + " [ 0.15721183 0.03064088 -0.00751081 ... 0.27183983 0.3881693\n", + " -0.01544908]\n", + " [ 0.26047793 0.16917065 0.00915196 ... 0.18076143 -0.05080506\n", + " 0.14791614]\n", + " [ 0.19052255 0.03642382 -0.14313167 ... 
0.2611448 0.20763844\n", + " 0.26846847]]\n", + "param grad: rnn.rnn_stacks.0.fw_cell.bias_hh: shape: [1024] stop_grad: False grad: [-0.4139514 -0.28493875 0.36796758 ... 0.23879525 0.06732336\n", + " 0.16263627]\n", + "param grad: rnn.rnn_stacks.0.bw_cell.weight_hh: shape: [1024, 1024] stop_grad: False grad: None\n", + "param grad: rnn.rnn_stacks.0.bw_cell.bias_hh: shape: [1024] stop_grad: False grad: None\n", + "param grad: rnn.rnn_stacks.1.fw_fc.weight: shape: [2048, 1024] stop_grad: False grad: [[ 0.04214853 -0.1710323 0.17557406 ... 0.11926915 0.21577051\n", + " -0.30598596]\n", + " [-0.02370887 -0.03498494 -0.05991999 ... -0.06049232 -0.14527473\n", + " -0.5335691 ]\n", + " [-0.21417995 -0.10263194 -0.05903128 ... -0.26958284 0.05936668\n", + " 0.25522667]\n", + " ...\n", + " [ 0.31594425 -0.29487017 0.15871571 ... 0.3504135 -0.1418606\n", + " -0.07482046]\n", + " [ 0.22316164 0.7682122 -0.22191924 ... -0.00535548 -0.6497105\n", + " -0.2011079 ]\n", + " [-0.05800886 0.13750821 0.02450509 ... 0.245736 0.07425706\n", + " -0.17761081]]\n", + "param grad: rnn.rnn_stacks.1.fw_bn.weight: shape: [1024] stop_grad: False grad: [-0.45080703 0.19005743 0.077441 ... -0.24504453 0.19666554\n", + " -0.10503208]\n", + "param grad: rnn.rnn_stacks.1.fw_bn.bias: shape: [1024] stop_grad: False grad: [-0.55867654 0.04237206 0.03389215 ... -0.35602498 0.25528812\n", + " 0.11344345]\n", + "param grad: rnn.rnn_stacks.1.fw_bn._mean: shape: [1024] stop_grad: True grad: None\n", + "param grad: rnn.rnn_stacks.1.fw_bn._variance: shape: [1024] stop_grad: True grad: None\n", + "param grad: rnn.rnn_stacks.1.fw_cell.weight_hh: shape: [1024, 1024] stop_grad: False grad: [[-0.48457903 0.04466334 -0.19785863 ... -0.0254025 -0.10338341\n", + " -0.29202533]\n", + " [-0.15261276 0.00412052 0.22198747 ... 0.22460426 -0.03752084\n", + " 0.05170784]\n", + " [-0.09337254 0.02530848 0.1263681 ... -0.02056236 0.33342454\n", + " -0.08760723]\n", + " ...\n", + " [-0.28645608 -0.19169135 -0.1361257 ... -0.00444204 -0.06552711\n", + " -0.14726155]\n", + " [ 0.21883707 0.2049045 0.23723911 ... 0.4626113 -0.14110637\n", + " 0.02569831]\n", + " [ 0.37554163 -0.19249167 0.14591683 ... 0.25602737 0.40088275\n", + " 0.41056633]]\n", + "param grad: rnn.rnn_stacks.1.fw_cell.bias_hh: shape: [1024] stop_grad: False grad: [-0.55867654 0.04237211 0.0338921 ... -0.35602498 0.2552881\n", + " 0.11344352]\n", + "param grad: rnn.rnn_stacks.1.bw_cell.weight_hh: shape: [1024, 1024] stop_grad: False grad: None\n", + "param grad: rnn.rnn_stacks.1.bw_cell.bias_hh: shape: [1024] stop_grad: False grad: None\n", + "param grad: rnn.rnn_stacks.2.fw_fc.weight: shape: [2048, 1024] stop_grad: False grad: [[-0.28007814 -0.09206 -0.01297755 ... -0.2557205 -0.2693453\n", + " 0.05862035]\n", + " [-0.34194735 -0.01383794 -0.06490533 ... -0.11063005 0.16226721\n", + " -0.3197178 ]\n", + " [-0.3646778 0.15443833 0.02241019 ... -0.15093157 -0.09886418\n", + " -0.44295847]\n", + " ...\n", + " [-0.01041886 -0.57636976 -0.03988511 ... -0.2260822 0.49646813\n", + " -0.15528557]\n", + " [-0.19385241 -0.56451964 -0.05551083 ... -0.5638106 0.43611372\n", + " -0.61484563]\n", + " [ 0.1051331 -0.4762463 0.11194798 ... -0.26766616 -0.30734932\n", + " 0.17856634]]\n", + "param grad: rnn.rnn_stacks.2.fw_bn.weight: shape: [1024] stop_grad: False grad: [-0.02791309 -0.992517 0.63012564 ... -1.1830902 1.4646478\n", + " 1.6333911 ]\n", + "param grad: rnn.rnn_stacks.2.fw_bn.bias: shape: [1024] stop_grad: False grad: [-0.10834587 -1.7079136 0.81259465 ... 
-1.4478713 1.455745\n", + " 2.069446 ]\n", + "param grad: rnn.rnn_stacks.2.fw_bn._mean: shape: [1024] stop_grad: True grad: None\n", + "param grad: rnn.rnn_stacks.2.fw_bn._variance: shape: [1024] stop_grad: True grad: None\n", + "param grad: rnn.rnn_stacks.2.fw_cell.weight_hh: shape: [1024, 1024] stop_grad: False grad: [[-0.14363798 -0.06933184 0.02901152 ... -0.19233373 -0.03206367\n", + " -0.00845779]\n", + " [-0.44314507 -0.8921327 -1.031872 ... -0.558997 -0.53070104\n", + " -0.855925 ]\n", + " [ 0.15673254 0.28793585 0.13351494 ... 0.38433537 0.5040767\n", + " 0.11303265]\n", + " ...\n", + " [-0.22923109 -0.62508404 -0.6195032 ... -0.6876448 -0.41718128\n", + " -0.74844164]\n", + " [ 0.18024652 0.45618314 0.81391454 ... 0.5780604 0.87566674\n", + " 0.71526295]\n", + " [ 0.3763076 0.54033077 0.9940485 ... 1.087821 0.72288674\n", + " 1.2852117 ]]\n", + "param grad: rnn.rnn_stacks.2.fw_cell.bias_hh: shape: [1024] stop_grad: False grad: [-0.10834593 -1.7079139 0.8125948 ... -1.4478711 1.4557447\n", + " 2.0694466 ]\n", + "param grad: rnn.rnn_stacks.2.bw_cell.weight_hh: shape: [1024, 1024] stop_grad: False grad: None\n", + "param grad: rnn.rnn_stacks.2.bw_cell.bias_hh: shape: [1024] stop_grad: False grad: None\n", + "param grad: fc.weight: shape: [2048, 4299] stop_grad: False grad: [[ 1.4382483e-02 2.0160766e-02 1.2322801e-02 ... 1.0075266e-02\n", + " 7.4421698e-03 -2.3925617e+01]\n", + " [ 3.7887424e-02 5.7105277e-02 2.8803380e-02 ... 2.4820438e-02\n", + " 1.8560058e-02 -5.0687141e+01]\n", + " [ 4.5566272e-02 5.4415584e-02 3.2858539e-02 ... 3.2725763e-02\n", + " 2.1536341e-02 -6.1036335e+01]\n", + " ...\n", + " [ 2.8015019e-02 3.5967816e-02 2.3228688e-02 ... 2.1284629e-02\n", + " 1.3860047e-02 -5.2543671e+01]\n", + " [ 2.8445240e-02 4.2448867e-02 2.7125146e-02 ... 2.2253662e-02\n", + " 1.7470375e-02 -4.3619675e+01]\n", + " [ 4.7438074e-02 5.8287360e-02 3.4546286e-02 ... 3.0827176e-02\n", + " 2.2168703e-02 -6.7901680e+01]]\n", + "param grad: fc.bias: shape: [4299] stop_grad: False grad: [ 8.8967547e-02 1.0697905e-01 6.5251388e-02 ... 
6.1503030e-02\n", + " 4.3404289e-02 -1.3512518e+02]\n" + ] + } + ], + "source": [ + "loss.backward(retain_graph=False)\n", + "for n, p in dp_model.named_parameters():\n", + " print(\n", + " f\"param grad: {n}: shape: {p.shape} stop_grad: {p.stop_gradient} grad: {p.grad}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "selected-crazy", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1.]\n" + ] + } + ], + "source": [ + "print(loss.grad)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bottom-engineer", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "stuffed-yeast", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7aad026c5..072225ccb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,4 +38,4 @@ entry: python .pre-commit-hooks/copyright-check.hook language: system files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$ - exclude: (?=decoders/swig).*(\.cpp|\.h)$ \ No newline at end of file + #exclude: (?=decoders/swig).*(\.cpp|\.h)$ diff --git a/.travis.yml b/.travis.yml index 6ca50d954..d1f4abb50 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,7 +1,7 @@ language: cpp cache: ccache sudo: required -dist: xenial +dist: Bionic services: - docker os: @@ -26,7 +26,7 @@ script: - exit_code=0 - .travis/precommit.sh || exit_code=$(( exit_code | $? )) - docker run -i --rm -v "$PWD:/py_unittest" paddlepaddle/paddle:latest /bin/bash -c - 'cd /py_unittest; sh .travis/unittest.sh' || exit_code=$(( exit_code | $? )) + 'cd /py_unittest; source env.sh; bash .travis/unittest.sh' || exit_code=$(( exit_code | $? )) exit $exit_code notifications: diff --git a/.travis/unittest.sh b/.travis/unittest.sh index e42764368..2eb57477f 100755 --- a/.travis/unittest.sh +++ b/.travis/unittest.sh @@ -15,7 +15,7 @@ unittest(){ if [ $? != 0 ]; then exit 1 fi - find . -name 'tests' -type d -print0 | \ + find . -path ./tools/venv -prune -false -o -name 'tests' -type d -print0 | \ xargs -0 -I{} -n1 bash -c \ 'python3 -m unittest discover -v -s {}' cd - > /dev/null @@ -24,6 +24,10 @@ unittest(){ trap 'abort' 0 set -e +cd tools; make; cd - +. tools/venv/bin/activate +pip3 install pytest + unittest . trap : 0 diff --git a/README.md b/README.md index 7d99ef99b..ed04d2415 100644 --- a/README.md +++ b/README.md @@ -1,475 +1,37 @@ -# DeepSpeech2 on PaddlePaddle +# DeepSpeech on PaddlePaddle [中文版](README_cn.md) -*DeepSpeech2 on PaddlePaddle* is an open-source implementation of end-to-end Automatic Speech Recognition (ASR) engine, based on [Baidu's Deep Speech 2 paper](http://proceedings.mlr.press/v48/amodei16.pdf), with [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) platform. Our vision is to empower both industrial application and academic research on speech recognition, via an easy-to-use, efficient and scalable implementation, including training, inference & testing module, and demo deployment. 
Besides, several pre-trained models for both English and Mandarin are also released. +*DeepSpeech on PaddlePaddle* is an open-source implementation of end-to-end Automatic Speech Recognition (ASR) engine, with the [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) platform. Our vision is to empower both industrial application and academic research on speech recognition, via an easy-to-use, efficient and scalable implementation, including training, inference & testing module, and demo deployment. -## Table of Contents -- [Installation](#installation) -- [Getting Started](#getting-started) -- [Data Preparation](#data-preparation) -- [Training a Model](#training-a-model) -- [Inference and Evaluation](#inference-and-evaluation) -- [Hyper-parameters Tuning](#hyper-parameters-tuning) -- [Trying Live Demo with Your Own Voice](#trying-live-demo-with-your-own-voice) -- [Experiments and Benchmarks](#experiments-and-benchmarks) -- [Released Models](#released-models) -- [Questions and Help](#questions-and-help) +For more information, please see the docs under `doc`. +## Models +* [Baidu's Deep Speech2](http://proceedings.mlr.press/v48/amodei16.pdf) -## Installation - -To avoid the trouble of environment setup, [running in Docker container](#running-in-docker-container) is highly recommended. Otherwise follow the guidelines below to install the dependencies manually. - -### Prerequisites -- Python >= 3.6 -- PaddlePaddle 1.8.0 or later (please refer to the [Installation Guide](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/index_en.html)) - -### Setup -- Make sure these libraries or tools installed: `pkg-config`, `flac`, `ogg`, `vorbis`, `boost` and `swig`, e.g. installing them via `apt-get`: - -```bash -sudo apt-get install -y pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev -``` - -or, installing them via `yum`: - -```bash -sudo yum install pkgconfig libogg-devel libvorbis-devel boost-devel python3-devel -wget https://ftp.osuosl.org/pub/xiph/releases/flac/flac-1.3.1.tar.xz -xz -d flac-1.3.1.tar.xz -tar -xvf flac-1.3.1.tar -cd flac-1.3.1 -./configure -make -make install -``` +## Setup +* python3.7 +* paddlepaddle 2.0.0 - Run the setup script for the remaining dependencies ```bash git clone https://github.com/PaddlePaddle/DeepSpeech.git cd DeepSpeech -sh setup.sh +pushd tools; make; popd +source tools/venv/bin/activate +bash setup.sh ``` -### Running in Docker Container - -Docker is an open source tool to build, ship, and run distributed applications in an isolated environment. A Docker image for this project has been provided in [hub.docker.com](https://hub.docker.com) with all the dependencies installed, including the pre-built PaddlePaddle, CTC decoders, and other necessary Python and third-party packages. This Docker image requires the support of NVIDIA GPU, so please make sure its availability and the [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) has been installed. - -Take several steps to launch the Docker image: - -- Download the Docker image +- Source the venv before running an experiment.
```bash -nvidia-docker pull hub.baidubce.com/paddlepaddle/deep_speech_fluid:latest-gpu -``` - -- Clone this repository - -``` -git clone https://github.com/PaddlePaddle/DeepSpeech.git -``` - -- Run the Docker image - -```bash -sudo nvidia-docker run -it -v $(pwd)/DeepSpeech:/DeepSpeech hub.baidubce.com/paddlepaddle/deep_speech_fluid:latest-gpu /bin/bash -``` -Now go back and start from the [Getting Started](#getting-started) section, you can execute training, inference and hyper-parameters tuning similarly in the Docker container. - - -- Install PaddlePaddle - -For example, for CUDA 10.1, CuDNN7.5: -```bash -python3 -m pip install paddlepaddle-gpu==1.8.0.post107 +source tools/venv/bin/activate ``` ## Getting Started -Several shell scripts provided in `./examples` will help us to quickly give it a try, for most major modules, including data preparation, model training, case inference and model evaluation, with a few public dataset (e.g. [LibriSpeech](http://www.openslr.org/12/), [Aishell](http://www.openslr.org/33)). Reading these examples will also help you to understand how to make it work with your own data. - -Some of the scripts in `./examples` are configured with 8 GPUs. If you don't have 8 GPUs available, please modify `CUDA_VISIBLE_DEVICES`. If you don't have any GPU available, please set `--use_gpu` to False to use CPUs instead. Besides, if out-of-memory problem occurs, just reduce `--batch_size` to fit. - -Let's take a tiny sampled subset of [LibriSpeech dataset](http://www.openslr.org/12/) for instance. - -- Go to directory - - ```bash - cd examples/tiny - ``` - - Notice that this is only a toy example with a tiny sampled subset of LibriSpeech. If you would like to try with the complete dataset (would take several days for training), please go to `examples/librispeech` instead. -- Source env - - ```bash - source path.sh - ``` - Set `MAIN_ROOT` as project dir. -- Main entrypoint - - ```bash - bash run.sh - ``` - -More detailed information are provided in the following sections. Wish you a happy journey with the *DeepSpeech2 on PaddlePaddle* ASR engine! - - -## Data Preparation - -### Generate Manifest - -*DeepSpeech2 on PaddlePaddle* accepts a textual **manifest** file as its data set interface. A manifest file summarizes a set of speech data, with each line containing some meta data (e.g. filepath, transcription, duration) of one audio clip, in [JSON](http://www.json.org/) format, such as: - -``` -{"audio_filepath": "/home/work/.cache/paddle/Libri/134686/1089-134686-0001.flac", "duration": 3.275, "text": "stuff it into you his belly counselled him"} -{"audio_filepath": "/home/work/.cache/paddle/Libri/134686/1089-134686-0007.flac", "duration": 4.275, "text": "a cold lucid indifference reigned in his soul"} -``` - -To use your custom data, you only need to generate such manifest files to summarize the dataset. Given such summarized manifests, training, inference and all other modules can be aware of where to access the audio files, as well as their meta data including the transcription labels. - -For how to generate such manifest files, please refer to `examples/librispeech/local/librispeech.py`, which will download data and generate manifest files for LibriSpeech dataset. 
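
To make the manifest format concrete, here is a minimal, hypothetical sketch of writing such a file for a custom dataset. The paths are placeholders (not real repository data), and the durations and transcripts simply mirror the example lines above; a real script would walk your own corpus and measure each clip's duration from the audio itself.

```python
# Minimal sketch: write a manifest in the JSON-lines format described above.
# Paths are placeholders; durations and transcripts mirror the example entries.
import json

samples = [
    ("/data/my_corpus/utt_0001.flac", 3.275, "stuff it into you his belly counselled him"),
    ("/data/my_corpus/utt_0002.flac", 4.275, "a cold lucid indifference reigned in his soul"),
]

with open("manifest.custom", "w", encoding="utf-8") as fout:
    for audio_filepath, duration, text in samples:
        entry = {"audio_filepath": audio_filepath, "duration": duration, "text": text}
        fout.write(json.dumps(entry, ensure_ascii=False) + "\n")
```

Each line is an independent JSON object, so manifests can be concatenated, filtered or split with ordinary line-oriented tools.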
- -### Compute Mean & Stddev for Normalizer - -To perform z-score normalization (zero-mean, unit stddev) upon audio features, we have to estimate in advance the mean and standard deviation of the features, with some training samples: - -```bash -python3 tools/compute_mean_std.py \ ---num_samples 2000 \ ---specgram_type linear \ ---manifest_path examples/librispeech/data/manifest.train \ ---output_path examples/librispeech/data/mean_std.npz -``` - -It will compute the mean and standard deviatio of power spectrum feature with 2000 random sampled audio clips listed in `examples/librispeech/data/manifest.train` and save the results to `examples/librispeech/data/mean_std.npz` for further usage. - - -### Build Vocabulary - -A vocabulary of possible characters is required to convert the transcription into a list of token indices for training, and in decoding, to convert from a list of indices back to text again. Such a character-based vocabulary can be built with `tools/build_vocab.py`. - -```bash -python3 tools/build_vocab.py \ ---count_threshold 0 \ ---vocab_path examples/librispeech/data/eng_vocab.txt \ ---manifest_paths examples/librispeech/data/manifest.train -``` - -It will write a vocabuary file `examples/librispeech/data/eng_vocab.txt` with all transcription text in `examples/librispeech/data/manifest.train`, without vocabulary truncation (`--count_threshold 0`). - -### More Help - -For more help on arguments: - -```bash -python3 examples/librispeech/local/librispeech.py --help -python3 tools/compute_mean_std.py --help -python3 tools/build_vocab.py --help -``` - -## Training a model - -`train.py` is the main caller of the training module. Examples of usage are shown below. - -- Start training from scratch with 8 GPUs: - - ``` - CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 train.py - ``` - -- Start training from scratch with CPUs: - - ``` - python3 train.py --use_gpu False - ``` -- Resume training from a checkpoint: - - ``` - CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ - python3 train.py \ - --init_from_pretrained_model CHECKPOINT_PATH_TO_RESUME_FROM - ``` - -For more help on arguments: - -```bash -python3 train.py --help -``` -or refer to `example/librispeech/local/run_train.sh`. - - -### Data Augmentation Pipeline - -Data augmentation has often been a highly effective technique to boost the deep learning performance. We augment our speech data by synthesizing new audios with small random perturbation (label-invariant transformation) added upon raw audios. You don't have to do the syntheses on your own, as it is already embedded into the data provider and is done on the fly, randomly for each epoch during training. - -Six optional augmentation components are provided to be selected, configured and inserted into the processing pipeline. - - - Volume Perturbation - - Speed Perturbation - - Shifting Perturbation - - Online Bayesian normalization - - Noise Perturbation (need background noise audio files) - - Impulse Response (need impulse audio files) - -In order to inform the trainer of what augmentation components are needed and what their processing orders are, it is required to prepare in advance an *augmentation configuration file* in [JSON](http://www.json.org/) format. 
For example: - -``` -[{ - "type": "speed", - "params": {"min_speed_rate": 0.95, - "max_speed_rate": 1.05}, - "prob": 0.6 -}, -{ - "type": "shift", - "params": {"min_shift_ms": -5, - "max_shift_ms": 5}, - "prob": 0.8 -}] -``` - -When the `--augment_conf_file` argument of `trainer.py` is set to the path of the above example configuration file, every audio clip in every epoch will be processed: with 60% of chance, it will first be speed perturbed with a uniformly random sampled speed-rate between 0.95 and 1.05, and then with 80% of chance it will be shifted in time with a random sampled offset between -5 ms and 5 ms. Finally this newly synthesized audio clip will be feed into the feature extractor for further training. - -For other configuration examples, please refer to `conf/augmenatation.config.example`. - -Be careful when utilizing the data augmentation technique, as improper augmentation will do harm to the training, due to the enlarged train-test gap. - - -### Training for Mandarin Language - -The key steps of training for Mandarin language are same to that of English language and we have also provided an example for Mandarin training with Aishell in ```examples/aishell/local```. As mentioned above, please execute ```sh run_data.sh```, ```sh run_train.sh```, ```sh run_test.sh``` and ```sh run_infer.sh``` to do data preparation, training, testing and inference correspondingly. We have also prepared a pre-trained model (downloaded by ./models/aishell/download_model.sh) for users to try with ```sh run_infer_golden.sh``` and ```sh run_test_golden.sh```. Notice that, different from English LM, the Mandarin LM is character-based and please run ```tools/tune.py``` to find an optimal setting. - - -## Inference and Evaluation - -### Prepare Language Model - -A language model is required to improve the decoder's performance. We have prepared two language models (with lossy compression) for users to download and try. One is for English and the other is for Mandarin. Users can simply run this to download the preprared language models: - -```bash -cd models/lm -bash download_lm_en.sh -bash download_lm_ch.sh -``` - -If you wish to train your own better language model, please refer to [KenLM](https://github.com/kpu/kenlm) for tutorials. Here we provide some tips to show how we preparing our English and Mandarin language models. You can take it as a reference when you train your own. - -#### English LM - -The English corpus is from the [Common Crawl Repository](http://commoncrawl.org) and you can download it from [statmt](http://data.statmt.org/ngrams/deduped_en). We use part en.00 to train our English language model. There are some preprocessing steps before training: - - * Characters not in \['A-Za-z0-9\s'\] (\s represents whitespace characters) are removed and Arabic numbers are converted to English numbers like 1000 to one thousand. - * Repeated whitespace characters are squeezed to one and the beginning whitespace characters are removed. Notice that all transcriptions are lowercase, so all characters are converted to lowercase. - * Top 400,000 most frequent words are selected to build the vocabulary and the rest are replaced with 'UNKNOWNWORD'. - -Now the preprocessing is done and we get a clean corpus to train the language model. Our released language model are trained with agruments '-o 5 --prune 0 1 1 1 1'. '-o 5' means the max order of language model is 5. '--prune 0 1 1 1 1' represents count thresholds for each order and more specifically it will prune singletons for orders two and higher. 
To save disk storage we convert the arpa file to 'trie' binary file with arguments '-a 22 -q 8 -b 8'. '-a' represents the maximum number of leading bits of pointers in 'trie' to chop. '-q -b' are quantization parameters for probability and backoff. - -#### Mandarin LM - -Different from the English language model, Mandarin language model is character-based where each token is a Chinese character. We use internal corpus to train the released Mandarin language models. The corpus contain billions of tokens. The preprocessing has tiny difference from English language model and main steps include: - - * The beginning and trailing whitespace characters are removed. - * English punctuations and Chinese punctuations are removed. - * A whitespace character between two tokens is inserted. - -Please notice that the released language models only contain Chinese simplified characters. After preprocessing done we can begin to train the language model. The key training arguments for small LM is '-o 5 --prune 0 1 2 4 4' and '-o 5' for large LM. Please refer above section for the meaning of each argument. We also convert the arpa file to binary file using default settings. - -### Speech-to-text Inference - -An inference module caller `infer.py` is provided to infer, decode and visualize speech-to-text results for several given audio clips. It might help to have an intuitive and qualitative evaluation of the ASR model's performance. - -- Inference with GPU: - - ```bash - CUDA_VISIBLE_DEVICES=0 python3 infer.py - ``` - -- Inference with CPUs: - - ```bash - python3 infer.py --use_gpu False - ``` - -We provide two types of CTC decoders: *CTC greedy decoder* and *CTC beam search decoder*. The *CTC greedy decoder* is an implementation of the simple best-path decoding algorithm, selecting at each timestep the most likely token, thus being greedy and locally optimal. The [*CTC beam search decoder*](https://arxiv.org/abs/1408.2873) otherwise utilizes a heuristic breadth-first graph search for reaching a near global optimality; it also requires a pre-trained KenLM language model for better scoring and ranking. The decoder type can be set with argument `--decoding_method`. - -For more help on arguments: - -``` -python3 infer.py --help -``` -or refer to `example/librispeech/local/run_infer.sh`. - -### Evaluate a Model - -To evaluate a model's performance quantitatively, please run: - -- Evaluation with GPUs: - - ```bash - CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 test.py - ``` - -- Evaluation with CPUs: - - ```bash - python3 test.py --use_gpu False - ``` - -The error rate (default: word error rate; can be set with `--error_rate_type`) will be printed. - -For more help on arguments: - -```bash -python3 test.py --help -``` -or refer to `example/librispeech/local/run_test.sh`. - -## Hyper-parameters Tuning - -The hyper-parameters $\alpha$ (language model weight) and $\beta$ (word insertion weight) for the [*CTC beam search decoder*](https://arxiv.org/abs/1408.2873) often have a significant impact on the decoder's performance. It would be better to re-tune them on the validation set when the acoustic model is renewed. - -`tools/tune.py` performs a 2-D grid search over the hyper-parameter $\alpha$ and $\beta$. You must provide the range of $\alpha$ and $\beta$, as well as the number of their attempts. 
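Conceptually, the search space is just the Cartesian product of evenly spaced values over the two ranges. The sketch below (illustrative only, not the actual `tools/tune.py` implementation) shows how the candidate pairs could be enumerated before each one is evaluated on the validation data; it uses the same ranges as the example command that follows.

```python
import numpy as np


def candidate_grid(alpha_from, alpha_to, num_alphas,
                   beta_from, beta_to, num_betas):
    """Build the (alpha, beta) pairs for a 2-D grid search (sketch only)."""
    alphas = np.linspace(alpha_from, alpha_to, num_alphas)
    betas = np.linspace(beta_from, beta_to, num_betas)
    return [(a, b) for a in alphas for b in betas]


pairs = candidate_grid(1.0, 3.2, 45, 0.1, 0.45, 8)
print(len(pairs))  # 360 (alpha, beta) points to score on the validation set
```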
- -- Tuning with GPU: - - ```bash - CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ - python3 tools/tune.py \ - --alpha_from 1.0 \ - --alpha_to 3.2 \ - --num_alphas 45 \ - --beta_from 0.1 \ - --beta_to 0.45 \ - --num_betas 8 - ``` - -- Tuning with CPU: - - ```bash - python3 tools/tune.py --use_gpu False - ``` - The grid search will print the WER (word error rate) or CER (character error rate) at each point in the hyper-parameters space, and draw the error surface optionally. A proper hyper-parameters range should include the global minima of the error surface for WER/CER, as illustrated in the following figure. - -

-[Figure: An example error surface for tuning on the dev-clean set of LibriSpeech]
- -Usually, as the figure shows, the variation of language model weight ($\alpha$) significantly affect the performance of CTC beam search decoder. And a better procedure is to first tune on serveral data batches (the number can be specified) to find out the proper range of hyper-parameters, then change to the whole validation set to carray out an accurate tuning. - -After tuning, you can reset $\alpha$ and $\beta$ in the inference and evaluation modules to see if they really help improve the ASR performance. For more help - -```bash -python3 tune.py --help -``` -or refer to `example/librispeech/local/run_tune.sh`. - - -## Trying Live Demo with Your Own Voice - -Until now, an ASR model is trained and tested qualitatively (`infer.py`) and quantitatively (`test.py`) with existing audio files. But it is not yet tested with your own speech. `deploy/demo_english_server.py` and `deploy/demo_client.py` helps quickly build up a real-time demo ASR engine with the trained model, enabling you to test and play around with the demo, with your own voice. - -To start the demo's server, please run this in one console: - -```bash -CUDA_VISIBLE_DEVICES=0 \ -python3 deploy/demo_server.py \ ---host_ip localhost \ ---host_port 8086 -``` - -For the machine (might not be the same machine) to run the demo's client, please do the following installation before moving on. - -For example, on MAC OS X: - -```bash -brew install portaudio -pip install pyaudio -pip install keyboard -``` - -Then to start the client, please run this in another console: - -```bash -CUDA_VISIBLE_DEVICES=0 \ -python3 -u deploy/demo_client.py \ ---host_ip 'localhost' \ ---host_port 8086 -``` - -Now, in the client console, press the `whitespace` key, hold, and start speaking. Until finishing your utterance, release the key to let the speech-to-text results shown in the console. To quit the client, just press `ESC` key. - -Notice that `deploy/demo_client.py` must be run on a machine with a microphone device, while `deploy/demo_server.py` could be run on one without any audio recording hardware, e.g. any remote server machine. Just be careful to set the `host_ip` and `host_port` argument with the actual accessible IP address and port, if the server and client are running with two separate machines. Nothing should be done if they are running on one single machine. - -Please also refer to `examples/deploy_demo/run_english_demo_server.sh`, which will first download a pre-trained English model (trained with 3000 hours of internal speech data) and then start the demo server with the model. With running `examples/deploy_demo/run_demo_client.sh`, you can speak English to test it. If you would like to try some other models, just update `--model_path` argument in the script.   - -For more help on arguments: - -```bash -python3 deploy/demo_server.py --help -python3 deploy/demo_client.py --help -``` - - -## Experiments and Benchmarks - -#### Benchmark Results for English Models (Word Error Rate) - -Test Set | LibriSpeech Model | BaiduEN8K Model -:--------------------- | ---------------: | -------------------: -LibriSpeech Test-Clean | 6.85 | 5.41 -LibriSpeech Test-Other | 21.18 | 13.85 -VoxForge American-Canadian | 12.12 |   7.13 -VoxForge Commonwealth | 19.82 | 14.93 -VoxForge European | 30.15 | 18.64 -VoxForge Indian | 53.73 | 25.51 -Baidu Internal Testset  |   40.75 |   8.48 - -For reproducing benchmark results on VoxForge data, we provide a script to download data and generate VoxForge dialect manifest files. 
Please go to ```data/voxforge``` and execute ```sh run_data.sh``` to get VoxForge dialect manifest files. Notice that VoxForge data may keep updating and the generated manifest files may differ from those we evaluated on. - -#### Benchmark Results for Mandarin Model (Character Error Rate) - -Test Set | BaiduCN1.2k Model -:--------------------- | -------------------: -Baidu Internal Testset | 12.64 - -#### Acceleration with Multi-GPUs - -We compare the training time with 1, 2, 4 and 8 Tesla V100 GPUs (on a subset of LibriSpeech samples whose audio durations are between 6.0 and 7.0 seconds). The results show that **near-linear** acceleration is achieved with multiple GPUs. In the following figure, the training time (in seconds) is printed on the blue bars. -
- -| # of GPU | Acceleration Rate | -| -------- | --------------: | -| 1 | 1.00 X | -| 2 | 1.98 X | -| 4 | 3.73 X | -| 8 | 6.95 X | - -`tools/profile.sh` provides such a profiling tool. - - -## Released Models - -#### Speech Model Released - -Language | Model Name | Training Data | Hours of Speech -:-----------: | :------------: | :----------: | -------: -English | [LibriSpeech Model](https://deepspeech.bj.bcebos.com/eng_models/librispeech_model_fluid.tar.gz) | [LibriSpeech Dataset](http://www.openslr.org/12/) | 960 h -English | [BaiduEN8k Model](https://deepspeech.bj.bcebos.com/demo_models/baidu_en8k_model_fluid.tar.gz) | Baidu Internal English Dataset | 8628 h -Mandarin | [Aishell Model](https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_fluid.tar.gz) | [Aishell Dataset](http://www.openslr.org/33/) | 151 h -Mandarin | [BaiduCN1.2k Model](https://deepspeech.bj.bcebos.com/demo_models/baidu_cn1.2k_model_fluid.tar.gz) | Baidu Internal Mandarin Dataset | 1204 h - -#### Language Model Released - -Language Model | Training Data | Token-based | Size | Descriptions -:-------------:| :------------:| :-----: | -----: | :----------------- -[English LM](https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm) | [CommonCrawl(en.00)](http://web-language-models.s3-website-us-east-1.amazonaws.com/ngrams/en/deduped/en.00.deduped.xz) | Word-based | 8.3 GB | Pruned with 0 1 1 1 1;
About 1.85 billion n-grams;
'trie' binary with '-a 22 -q 8 -b 8' -[Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4;
About 0.13 billion n-grams;
'probing' binary with default settings -[Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning;
About 3.7 billion n-grams;
'probing' binary with default settings - +Please see [Getting Started](docs/geting_started.md) and [tiny egs](examples/tiny/README.md). ## Questions and Help diff --git a/README_cn.md b/README_cn.md index 94825cb96..d8dd0db6f 100644 --- a/README_cn.md +++ b/README_cn.md @@ -1,479 +1,37 @@ -# 语音识别: DeepSpeech2 +# DeepSpeech on PaddlePaddle [English](README.md) -*DeepSpeech2*是一个采用[PaddlePaddle](https://github.com/PaddlePaddle/Paddle)平台的端到端自动语音识别(ASR)引擎的开源项目,具体原理参考这篇论文[Baidu's Deep Speech 2 paper](http://proceedings.mlr.press/v48/amodei16.pdf)。 +*DeepSpeech on PaddlePaddle*是一个采用[PaddlePaddle](https://github.com/PaddlePaddle/Paddle)平台的端到端自动语音识别(ASR)引擎的开源项目, 我们的愿景是为语音识别在工业应用和学术研究上,提供易于使用、高效和可扩展的工具,包括训练,推理,测试模块,以及 demo 部署。同时,我们还将发布一些预训练好的英语和普通话模型。 -## 目录 -- [安装](#安装) -- [开始](#开始) -- [数据准备](#数据准备) -- [训练模型](#训练模型) -- [推断和评价](#推断和评价) -- [超参数调整](#超参数调整) -- [用自己的声音尝试现场演示](#用自己的声音尝试现场演示) -- [试验和基准](#试验和基准) -- [发布模型](#发布模型) -- [问题和帮助](#问题和帮助) -## 安装 -为了避免环境配置问题,强烈建议在[Docker容器上运行](#在Docker容器上运行),否则请按照下面的指南安装依赖项。 - -### 前提 -- Python >= 3.6 -- PaddlePaddle 1.8.0 版本及以上(请参考[安装指南](https://www.paddlepaddle.org.cn/install/quick)) - -### 安装 -- 请确保以下库或工具已安装完毕:`pkg-config`, `flac`, `ogg`, `vorbis`, `boost` 和 `swig`, 如可以通过`apt-get`安装: - -```bash -sudo apt-get install -y pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev -``` +## 模型 +* [Baidu's Deep Speech2](http://proceedings.mlr.press/v48/amodei16.pdf) -或者,也可以通过`yum`安装: - -```bash -sudo yum install pkgconfig libogg-devel libvorbis-devel boost-devel python3-devel -wget https://ftp.osuosl.org/pub/xiph/releases/flac/flac-1.3.1.tar.xz -xz -d flac-1.3.1.tar.xz -tar -xvf flac-1.3.1.tar -cd flac-1.3.1 -./configure -make -make install -``` +## 安装 +* python3.7 +* paddlepaddle 2.0.0 -- 运行脚本安装其余的依赖项 +- 安装依赖 ```bash git clone https://github.com/PaddlePaddle/DeepSpeech.git cd DeepSpeech -sh setup.sh -``` - -### 在Docker容器上运行 - -Docker 是一个开源工具,用于在孤立的环境中构建、发布和运行分布式应用程序。此项目的 Docker 镜像已在[hub.docker.com](https://hub.docker.com)中提供,并安装了所有依赖项,其中包括预先构建的PaddlePaddle,CTC解码器以及其他必要的 Python 和第三方库。这个 Docker 映像需要NVIDIA GPU的支持,所以请确保它的可用性并已完成[nvidia-docker](https://github.com/NVIDIA/nvidia-docker)的安装。 - -采取以下步骤来启动 Docker 镜像: - -- 下载 Docker 镜像 - -```bash -nvidia-docker pull hub.baidubce.com/paddlepaddle/deep_speech_fluid:latest-gpu -``` - -- git clone 这个资源库 - -``` -git clone https://github.com/PaddlePaddle/DeepSpeech.git +pushd tools; make; popd +source tools/venv/bin/activate +bash setup.sh ``` -- 运行 Docker 镜像 +- 开始实验前要source环境. ```bash -sudo nvidia-docker run -it -v $(pwd)/DeepSpeech:/DeepSpeech hub.baidubce.com/paddlepaddle/deep_speech_fluid:latest-gpu /bin/bash -``` - -现在返回并从[开始](#开始)部分开始,您可以在Docker容器中同样执行模型训练,推断和超参数调整。 - -- 安装 PaddlePaddle - -例如 CUDA 10.1, CuDNN7.5: -```bash -python3 -m pip install paddlepaddle-gpu==1.8.0.post107 +source tools/venv/bin/activate ``` ## 开始 -`./examples`里的一些 shell 脚本将帮助我们在一些公开数据集(比如:[LibriSpeech](http://www.openslr.org/12/), [Aishell](http://www.openslr.org/33)) 进行快速尝试,包括了数据准备,模型训练,案例推断和模型评价。阅读这些例子将帮助你理解如何使用你的数据集训练模型。 - -`./examples`目录中的一些脚本配置使用了 8 个 GPU。如果你没有 8 个可用的 GPU,请修改环境变量`CUDA_VISIBLE_DEVICES`。如果你没有可用的 GPU,请设置`--use_gpu`为 False,这样程序会用 CPU 代替 GPU。另外如果发生内存不足的问题,减小`--batch_size`即可。 - -让我们先看看[LibriSpeech dataset](http://www.openslr.org/12/)小样本集的例子。 - -- 进入目录 - - ```bash - cd examples/tiny - ``` - - 注意这仅仅是 LibriSpeech 一个小数据集的例子。如果你想尝试完整的数据集(可能需要花好几天来训练模型),请使用这个路径`examples/librispeech`。 -- 设置环境变量 - - ```bash - source path.sh - ``` -- 入口脚本 - - ```bash - bash run.sh - ``` - -更多细节会在接下来的章节中阐述。祝你在*DeepSpeech2*ASR引擎学习中过得愉快! 
- - -## 数据准备 - -### 生成Manifest - -*DeepSpeech2*接受文本**manifest**文件作为数据接口。manifest 文件包含了一系列语音数据,其中每一行代表一个[JSON](http://www.json.org/)格式的音频元数据(比如文件路径,描述,时长)。具体格式如下: - -``` -{"audio_filepath": "/home/work/.cache/paddle/Libri/134686/1089-134686-0001.flac", "duration": 3.275, "text": "stuff it into you his belly counselled him"} -{"audio_filepath": "/home/work/.cache/paddle/Libri/134686/1089-134686-0007.flac", "duration": 4.275, "text": "a cold lucid indifference reigned in his soul"} -``` - -如果你要使用自定义数据,你只需要按照以上格式生成自己的 manifest 文件即可。给定 manifest 文件,训练、推断以及其它所有模块都能够访问到音频数据以及对应的时长和标签数据。 - -关于如何生成 manifest 文件,请参考`examples/librispeech/local/librispeech.py`。该脚本将会下载 LibriSpeech 数据集并生成 manifest 文件。 - -### 计算均值和标准差用于归一化 - -为了对音频特征进行 z-score 归一化(零均值,单位标准差),我们必须预估训练样本特征的均值和标准差: - -```bash -python3 tools/compute_mean_std.py \ ---num_samples 2000 \ ---specgram_type linear \ ---manifest_path examples/librispeech/data/manifest.train \ ---output_path examples/librispeech/data/mean_std.npz -``` - -以上这段代码会计算在`examples/librispeech/data/manifest.train`路径中,2000 个随机采样的语音频谱特征的均值和标准差,并将结果保存在`examples/librispeech/data/mean_std.npz`中,方便以后使用。 - -### 建立词表 - -我们需要一个包含可能会出现的字符集合的词表来在训练的时候将字符转换成索引,并在解码的时候将索引转换回文本。`tools/build_vocab.py`脚本将生成这种基于字符的词表。 - -```bash -python3 tools/build_vocab.py \ ---count_threshold 0 \ ---vocab_path examples/librispeech/data/eng_vocab.txt \ ---manifest_paths examples/librispeech/data/manifest.train -``` - -它将`examples/librispeech/data/manifest.train`目录中的所有录音文本写入词表文件`examples/librispeeech/data/eng_vocab.txt`,并且没有词汇截断(`--count_threshold 0`)。 - -### 更多帮助 - -获得更多帮助: - -```bash -python3 examples/librispeech/local/librispeech.py --help -python3 tools/compute_mean_std.py --help -python3 tools/build_vocab.py --help -``` - - - -## 训练模型 - -`train.py`是训练模块的主要调用者。使用示例如下。 - -- 开始使用 8 片 GPU 训练: - - ``` - CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 train.py - ``` - -- 开始使用 CPU 训练: - - ``` - python3 train.py --use_gpu False - ``` - -- 从检查点恢复训练: - - ``` - CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ - python3 train.py \ - --init_from_pretrained_model CHECKPOINT_PATH_TO_RESUME_FROM - ``` - -获得更多帮助: - -```bash -python3 train.py --help -``` -或参考 `example/librispeech/local/run_train.sh`. - - -### 数据增强流水线 - -数据增强是用来提升深度学习性能的非常有效的技术。我们通过在原始音频中添加小的随机扰动(标签不变转换)获得新音频来增强我们的语音数据。你不必自己合成,因为数据增强已经嵌入到数据生成器中并且能够即时完成,在训练模型的每个epoch中随机合成音频。 - -目前提供六个可选的增强组件供选择,配置并插入处理过程。 - - - 音量扰动 - - 速度扰动 - - 移动扰动 - - 在线贝叶斯归一化 - - 噪声干扰(需要背景噪音的音频文件) - - 脉冲响应(需要脉冲音频文件) - -为了让训练模块知道需要哪些增强组件以及它们的处理顺序,我们需要事先准备一个[JSON](http://www.json.org/)格式的*扩展配置文件*。例如: - -``` -[{ - "type": "speed", - "params": {"min_speed_rate": 0.95, - "max_speed_rate": 1.05}, - "prob": 0.6 -}, -{ - "type": "shift", - "params": {"min_shift_ms": -5, - "max_shift_ms": 5}, - "prob": 0.8 -}] -``` - -当`trainer.py`的`--augment_conf_file`参数被设置为上述示例配置文件的路径时,每个 epoch 中的每个音频片段都将被处理。首先,均匀随机采样速率会有60%的概率在 0.95 和 1.05 之间对音频片段进行速度扰动。然后,音频片段有 80% 的概率在时间上被挪移,挪移偏差值是 -5 毫秒和 5 毫秒之间的随机采样。最后,这个新合成的音频片段将被传送给特征提取器,以用于接下来的训练。 - -有关其他配置实例,请参考`conf/augmenatation.config.example`. 
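作为补充,下面给出一段简化的示意代码(并非本项目的实际实现,变速部分只用粗略的重采样代替),演示上述 JSON 配置如何被逐项按 `prob` 概率应用到一段波形上:

```python
import json
import random

import numpy as np


def apply_augmentation(samples, sample_rate, config_json, rng=None):
    """按配置以各自的概率依次应用增强组件(示意实现,非项目代码)。"""
    rng = rng or random.Random()
    for item in json.loads(config_json):
        if rng.uniform(0.0, 1.0) >= item["prob"]:
            continue  # 本条增强此次不生效
        p = item["params"]
        if item["type"] == "speed":
            # 简化的变速:按随机速率对波形重新插值(真实实现会更精细)
            rate = rng.uniform(p["min_speed_rate"], p["max_speed_rate"])
            old_idx = np.arange(len(samples))
            new_idx = np.linspace(0, len(samples) - 1, int(len(samples) / rate))
            samples = np.interp(new_idx, old_idx, samples)
        elif item["type"] == "shift":
            # 时间平移:正值向右、负值向左,空出的部分补零
            shift_ms = rng.uniform(p["min_shift_ms"], p["max_shift_ms"])
            shift = int(sample_rate * shift_ms / 1000.0)
            out = np.zeros_like(samples)
            if shift >= 0:
                out[shift:] = samples[:len(samples) - shift]
            else:
                out[:shift] = samples[-shift:]
            samples = out
    return samples
```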
- -使用数据增强技术时要小心,由于扩大了训练和测试集的差异,不恰当的增强会对训练模型不利,导致训练和预测的差距增大。 - -### 训练普通话语言 - -普通话语言训练与英语训练的关键步骤相同,我们提供了一个使用 Aishell 进行普通话训练的例子```examples/aishell```。如上所述,请执行```sh run_data.sh```, ```sh run_train.sh```, ```sh run_test.sh```和```sh run_infer.sh```做相应的数据准备,训练,测试和推断。我们还准备了一个预训练过的模型(执行./models/aishell/download_model.sh下载)供用户使用```run_infer_golden.sh```和```run_test_golden.sh```来。请注意,与英语语言模型不同,普通话语言模型是基于汉字的,请运行```tools/tune.py```来查找最佳设置。 - - - -## 推断和评价 - -### 准备语言模型 - -提升解码器的性能需要准备语言模型。我们准备了两种语言模型(有损压缩)供用户下载和尝试。一个是英语模型,另一个是普通话模型。用户可以执行以下命令来下载已经训练好的语言模型: - -```bash -cd models/lm -bash download_lm_en.sh -bash download_lm_ch.sh -``` - -如果你想训练自己更好的语言模型,请参考[KenLM](https://github.com/kpu/kenlm)获取教程。在这里,我们提供一些技巧来展示我们如何准备我们的英语和普通话模型。当你训练自己的模型的时候,可以参考这些技巧。 - - -#### 英语语言模型 - -英语语料库来自[Common Crawl Repository](http://commoncrawl.org),你可以从[statmt](http://data.statmt.org/ngrams/deduped_en)下载它。我们使用en.00部分来训练我们的英语语言模型。训练前有如下的一些预处理过程: - - * 不在\['A-Za-z0-9\s'\](\s表示空白字符)中的字符将被删除,阿拉伯数字被转换为英文数字,比如“1000”转换为 one thousand。 - * 重复的空白字符被压缩为一个,并且开始的空白字符将被删除。请注意,所有的录音都是小写字母,因此所有字符都转换为小写字母。 - * 选择前 40 万个最常用的单词来建立词表,其余部分将被替换为“UNKNOWNWORD”。 - -现在预处理完成了,我们得到一个干净的语料库来训练语言模型。我们发布的语言模型版本使用了参数“-o 5 --prune 0 1 1 1 1”来训练。“-o 5”表示语言模型的最大order为 5。“--prune 0 1 1 1 1”表示每个 order 的计数阈值,更具体地说,它将第 2 个以及更高的 order 修剪为单个。为了节省磁盘存储空间,我们将使用参数“-a 22 -q 8 -b 8”将 arpa 文件转换为“trie”二进制文件。“-a”表示在“trie”中用于切分的指针的最高位数。“-q -b”是概率和退避的量化参数。 - -#### 普通话语言模型 - -与英语语言模型不同的是,普通话语言模型是基于字符的,其中每一位都是中文汉字。我们使用内部语料库来训练发布的汉语语言模型。该语料库包含数十亿汉字。预处理阶段与英语语言模型有一些小的差别,主要步骤包括: - - * 删除开始和结尾的空白字符。 - * 删除英文标点和中文标点。 - * 在两个字符之间插入空白字符。 - -请注意,发布的语言模型只包含中文简体字。预处理完成后,我们开始训练语言模型。这个小的语言模型训练关键参数是“-o 5 --prune 0 1 2 4 4”,“-o 5”是针对大语言模型。请参考上面的部分了解每个参数的含义。我们还使用默认设置将 arpa 文件转换为二进制文件。 - -### 语音到文本推断 - -推断模块使用`infer.py`进行调用,可以用来推断,解码,以及输出一些给定音频片段可视化到文本的结果。这有助于对ASR模型的性能进行直观和定性的评估。 - -- GPU 版本的推断: - - ```bash - CUDA_VISIBLE_DEVICES=0 python3 infer.py - ``` - -- CPU 版本的推断: - - ```bash - python3 infer.py --use_gpu False - ``` - -我们提供两种类型的 CTC 解码器:*CTC贪心解码器*和*CTC波束搜索解码器*。*CTC贪心解码器*是简单的最佳路径解码算法的实现,在每个时间步选择最可能的字符,因此是贪心的并且是局部最优的。[*CTC波束搜索解码器*](https://arxiv.org/abs/1408.2873)另外使用了启发式广度优先图搜索以达到近似全局最优; 它也需要预先训练的KenLM语言模型以获得更好的评分和排名。解码器类型可以用参数`--decoding_method`设置。 - -获得更多帮助: - -``` -python3 infer.py --help -``` -或参考`example/librispeech/local/run_infer.sh`. - -### 评估模型 - -要定量评估模型的性能,请运行: - -- GPU 版本评估 - - ```bash - CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 test.py - ``` - -- CPU 版本评估 - - ```bash - python3 test.py --use_gpu False - ``` - -错误率(默认:误字率;可以用--error_rate_type设置)将被打印出来。 - -获得更多帮助: - -```bash -python3 test.py --help -``` -或参考`example/librispeech/local/run_test.sh`. - - - -## 超参数调整 - -[*CTC波束搜索解码器*](https://arxiv.org/abs/1408.2873)的超参数$\alpha$(语言模型权重)和$\beta$(单词插入权重)对解码器的性能有非常显著的影响。当声学模型更新时,最好在验证集上重新调整它们。 - -`tools/tune.py`会进行2维网格查找超参数$\alpha$和$\beta$。你必须提供$\alpha$和$\beta$的范围,以及尝试的次数。 - -- GPU 版的调整: - - ```bash - CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ - python3 tools/tune.py \ - --alpha_from 1.0 \ - --alpha_to 3.2 \ - --num_alphas 45 \ - --beta_from 0.1 \ - --beta_to 0.45 \ - --num_betas 8 - ``` - -- CPU 版的调整: - - ```bash - python3 tools/tune.py --use_gpu False - ``` -网格搜索将会在超参数空间的每个点处打印出 WER (误字率)或者 CER (字符错误率),并且可绘出误差曲面。一个合适的超参数范围应包括 WER/CER 误差表面的全局最小值,如下图所示。 - -

-[图:调整LibriSpeech的dev-clean集合的误差曲面示例]
- -通常,如图所示,语言模型权重($\alpha$)的变化显著影响 CTC波束搜索解码器的性能。更好的方法是首先调整多批数据(可指定数量)以找出适当的超参数范围,然后更改为完整的验证集以进行精确调整。 - -调整之后,您可以在推理和评价模块中重置$\alpha$和$\beta$,以检查它们是否真的有助于提高 ASR 性能。更多帮助如下: - -```bash -python3 tune.py --help -``` -或参考`example/librispeech/local/run_tune.sh`. - - -## 用自己的声音尝试现场演示 - -到目前为止,一个 ASR 模型已经训练完毕,并且用现有的音频文件进行了定性测试(`infer.py`)和定量测试(`test.py`)。但目前还没有用你自己的声音进行测试。`deploy/demo_english_server.py`和`deploy/demo_client.py`能够快速构建一个利用已训练好的模型对ASR引擎进行实时演示的系统,使你能够用自己的语音测试和演示。 - -要启动演示服务,请在控制台中运行: - -```bash -CUDA_VISIBLE_DEVICES=0 \ -python3 deploy/demo_server.py \ ---host_ip localhost \ ---host_port 8086 -``` - -对于运行 demo 客户端的机器(可能不是同一台机器),请在继续之前执行以下安装。 - -比如,对于 MAC OS X 机器: - -```bash -brew install portaudio -pip install pyaudio -pip install keyboard -``` - -然后启动客户端,请在另一个控制台中运行: - -```bash -CUDA_VISIBLE_DEVICES=0 \ -python3 -u deploy/demo_client.py \ ---host_ip 'localhost' \ ---host_port 8086 -``` - -现在,在客户端控制台中,按下`空格`键,按住并开始讲话。讲话完毕请释放该键以让控制台中显示语音的文本结果。要退出客户端,只需按`ESC`键。 - -请注意,`deploy/demo_client.py`必须在带麦克风设备的机器上运行,而`deploy/demo_server.py`可以在没有任何录音硬件的情况下运行,例如任何远程服务器机器。如果服务器和客户端使用两台独立的机器运行,只需要注意将`host_ip`和`host_port`参数设置为实际可访问的IP地址和端口。如果它们在单台机器上运行,则不用作任何处理。 - -请参考`examples/deploy_demo/run_english_demo_server.sh`,它将首先下载一个预先训练过的英语模型(用3000小时的内部语音数据训练),然后用模型启动演示服务器。通过运行`examples/deploy_demo/run_demo_client.sh`,你可以说英语来测试它。如果您想尝试其他模型,只需更新脚本中的`--model_path`参数即可。 - -获得更多帮助: - -```bash -python3 deploy/demo_server.py --help -python3 deploy/demo_client.py --help -``` - - -## 实验和baseline - -#### 英语模型的baseline测试结果(字错误率) - -测试集 | LibriSpeech Model | BaiduEN8K Model -:--------------------- | ---------------: | -------------------: -LibriSpeech Test-Clean | 6.85 | 5.41 -LibriSpeech Test-Other | 21.18 | 13.85 -VoxForge American-Canadian | 12.12 |   7.13 -VoxForge Commonwealth | 19.82 | 14.93 -VoxForge European | 30.15 | 18.64 -VoxForge Indian | 53.73 | 25.51 -Baidu Internal Testset  |   40.75 |   8.48 - -为了在VoxForge数据上重现基准测试结果,我们提供了一个脚本来下载数据并生成VoxForge方言manifest文件。请到```data/voxforge```执行````run_data.sh```来获取VoxForge方言manifest文件。请注意,VoxForge数据可能会持续更新,生成的清单文件可能与我们评估的清单文件有所不同。 - - -#### 普通话模型的baseline测试结果(字符错误率) - -测试集 | BaiduCN1.2k Model -:--------------------- | -------------------: -Baidu Internal Testset | 12.64 - -#### 多GPU加速 - -我们对1,2,4,8个Tesla V100 GPU的训练时间(LibriSpeech样本的子集,其音频持续时间介于6.0和7.0秒之间)进行比较。它表明,已经实现了具有多个GPU的**近线性**加速。在下图中,训练的时间(以秒为单位)显示在蓝色条上。 - -
- -| # of GPU | 加速比 | -| -------- | --------------: | -| 1 | 1.00 X | -| 2 | 1.98 X | -| 4 | 3.73 X | -| 8 | 6.95 X | - -`tools/profile.sh`提供了上述分析工具. - - -## 发布模型 - -#### 语音模型发布 - -语种 | 模型名 | 训练数据 | 语音时长 -:-----------: | :------------: | :----------: | -------: -English | [LibriSpeech Model](https://deepspeech.bj.bcebos.com/eng_models/librispeech_model_fluid.tar.gz) | [LibriSpeech Dataset](http://www.openslr.org/12/) | 960 h -English | [BaiduEN8k Model](https://deepspeech.bj.bcebos.com/demo_models/baidu_en8k_model_fluid.tar.gz) | Baidu Internal English Dataset | 8628 h -Mandarin | [Aishell Model](https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_fluid.tar.gz) | [Aishell Dataset](http://www.openslr.org/33/) | 151 h -Mandarin | [BaiduCN1.2k Model](https://deepspeech.bj.bcebos.com/demo_models/baidu_cn1.2k_model_fluid.tar.gz) | Baidu Internal Mandarin Dataset | 1204 h - -#### 语言模型发布 - -语言模型 | 训练数据 | 基于的字符 | 大小 | 描述 -:-------------:| :------------:| :-----: | -----: | :----------------- -[English LM](https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm) | [CommonCrawl(en.00)](http://web-language-models.s3-website-us-east-1.amazonaws.com/ngrams/en/deduped/en.00.deduped.xz) | Word-based | 8.3 GB | Pruned with 0 1 1 1 1;
About 1.85 billion n-grams;
'trie' binary with '-a 22 -q 8 -b 8' -[Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4;
About 0.13 billion n-grams;
'probing' binary with default settings -[Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning;
About 3.7 billion n-grams;
'probing' binary with default settings - +请查看 [Getting Started](docs/geting_started.md) 和 [tiny egs](examples/tiny/README.md)。 ## 问题和帮助 diff --git a/data_utils/data.py b/data_utils/data.py deleted file mode 100644 index 125768898..000000000 --- a/data_utils/data.py +++ /dev/null @@ -1,381 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Contains data generator for orgnaizing various audio data preprocessing -pipeline and offering data reader interface of PaddlePaddle requirements. -""" - -import random -import tarfile -import multiprocessing -import numpy as np -import paddle.fluid as fluid -from threading import local -from data_utils.utility import read_manifest -from data_utils.augmentor.augmentation import AugmentationPipeline -from data_utils.featurizer.speech_featurizer import SpeechFeaturizer -from data_utils.speech import SpeechSegment -from data_utils.normalizer import FeatureNormalizer - - -class DataGenerator(object): - """ - DataGenerator provides basic audio data preprocessing pipeline, and offers - data reader interfaces of PaddlePaddle requirements. - - :param vocab_filepath: Vocabulary filepath for indexing tokenized - transcripts. - :type vocab_filepath: str - :param mean_std_filepath: File containing the pre-computed mean and stddev. - :type mean_std_filepath: None|str - :param augmentation_config: Augmentation configuration in json string. - Details see AugmentationPipeline.__doc__. - :type augmentation_config: str - :param max_duration: Audio with duration (in seconds) greater than - this will be discarded. - :type max_duration: float - :param min_duration: Audio with duration (in seconds) smaller than - this will be discarded. - :type min_duration: float - :param stride_ms: Striding size (in milliseconds) for generating frames. - :type stride_ms: float - :param window_ms: Window size (in milliseconds) for generating frames. - :type window_ms: float - :param max_freq: Used when specgram_type is 'linear', only FFT bins - corresponding to frequencies between [0, max_freq] are - returned. - :types max_freq: None|float - :param specgram_type: Specgram feature type. Options: 'linear'. - :type specgram_type: str - :param use_dB_normalization: Whether to normalize the audio to -20 dB - before extracting the features. - :type use_dB_normalization: bool - :param random_seed: Random seed. - :type random_seed: int - :param keep_transcription_text: If set to True, transcription text will - be passed forward directly without - converting to index sequence. - :type keep_transcription_text: bool - :param place: The place to run the program. - :type place: CPUPlace or CUDAPlace - :param is_training: If set to True, generate text data for training, - otherwise, generate text data for infer. 
- :type is_training: bool - """ - - def __init__(self, - vocab_filepath, - mean_std_filepath, - augmentation_config='{}', - max_duration=float('inf'), - min_duration=0.0, - stride_ms=10.0, - window_ms=20.0, - max_freq=None, - specgram_type='linear', - use_dB_normalization=True, - random_seed=0, - keep_transcription_text=False, - place=fluid.CPUPlace(), - is_training=True): - self._max_duration = max_duration - self._min_duration = min_duration - self._normalizer = FeatureNormalizer(mean_std_filepath) - self._augmentation_pipeline = AugmentationPipeline( - augmentation_config=augmentation_config, random_seed=random_seed) - self._speech_featurizer = SpeechFeaturizer( - vocab_filepath=vocab_filepath, - specgram_type=specgram_type, - stride_ms=stride_ms, - window_ms=window_ms, - max_freq=max_freq, - use_dB_normalization=use_dB_normalization) - self._rng = random.Random(random_seed) - self._keep_transcription_text = keep_transcription_text - self._epoch = 0 - self._is_training = is_training - # for caching tar files info - self._local_data = local() - self._local_data.tar2info = {} - self._local_data.tar2object = {} - self._place = place - - def process_utterance(self, audio_file, transcript): - """Load, augment, featurize and normalize for speech data. - - :param audio_file: Filepath or file object of audio file. - :type audio_file: str | file - :param transcript: Transcription text. - :type transcript: str - :return: Tuple of audio feature tensor and data of transcription part, - where transcription part could be token ids or text. - :rtype: tuple of (2darray, list) - """ - if isinstance(audio_file, str) and audio_file.startswith('tar:'): - speech_segment = SpeechSegment.from_file( - self._subfile_from_tar(audio_file), transcript) - else: - speech_segment = SpeechSegment.from_file(audio_file, transcript) - self._augmentation_pipeline.transform_audio(speech_segment) - specgram, transcript_part = self._speech_featurizer.featurize( - speech_segment, self._keep_transcription_text) - specgram = self._normalizer.apply(specgram) - return specgram, transcript_part - - def batch_reader_creator(self, - manifest_path, - batch_size, - padding_to=-1, - flatten=False, - sortagrad=False, - shuffle_method="batch_shuffle"): - """ - Batch data reader creator for audio data. Return a callable generator - function to produce batches of data. - - Audio features within one batch will be padded with zeros to have the - same shape, or a user-defined shape. - - :param manifest_path: Filepath of manifest for audio files. - :type manifest_path: str - :param batch_size: Number of instances in a batch. - :type batch_size: int - :param padding_to: If set -1, the maximun shape in the batch - will be used as the target shape for padding. - Otherwise, `padding_to` will be the target shape. - :type padding_to: int - :param flatten: If set True, audio features will be flatten to 1darray. - :type flatten: bool - :param sortagrad: If set True, sort the instances by audio duration - in the first epoch for speed up training. - :type sortagrad: bool - :param shuffle_method: Shuffle method. Options: - '' or None: no shuffle. - 'instance_shuffle': instance-wise shuffle. - 'batch_shuffle': similarly-sized instances are - put into batches, and then - batch-wise shuffle the batches. - For more details, please see - ``_batch_shuffle.__doc__``. - 'batch_shuffle_clipped': 'batch_shuffle' with - head shift and tail - clipping. For more - details, please see - ``_batch_shuffle``. 
- If sortagrad is True, shuffle is disabled - for the first epoch. - :type shuffle_method: None|str - :return: Batch reader function, producing batches of data when called. - :rtype: callable - """ - - def batch_reader(): - # read manifest - manifest = read_manifest( - manifest_path=manifest_path, - max_duration=self._max_duration, - min_duration=self._min_duration) - # sort (by duration) or batch-wise shuffle the manifest - if self._epoch == 0 and sortagrad: - manifest.sort(key=lambda x: x["duration"]) - - else: - if shuffle_method == "batch_shuffle": - manifest = self._batch_shuffle( - manifest, batch_size, clipped=False) - elif shuffle_method == "batch_shuffle_clipped": - manifest = self._batch_shuffle( - manifest, batch_size, clipped=True) - elif shuffle_method == "instance_shuffle": - self._rng.shuffle(manifest) - elif shuffle_method == None: - pass - else: - raise ValueError("Unknown shuffle method %s." % - shuffle_method) - # prepare batches - batch = [] - instance_reader = self._instance_reader_creator(manifest) - - for instance in instance_reader(): - batch.append(instance) - if len(batch) == batch_size: - yield self._padding_batch(batch, padding_to, flatten) - batch = [] - if len(batch) >= 1: - yield self._padding_batch(batch, padding_to, flatten) - self._epoch += 1 - - return batch_reader - - @property - def feeding(self): - """Returns data reader's feeding dict. - - :return: Data feeding dict. - :rtype: dict - """ - feeding_dict = {"audio_spectrogram": 0, "transcript_text": 1} - return feeding_dict - - @property - def vocab_size(self): - """Return the vocabulary size. - - :return: Vocabulary size. - :rtype: int - """ - return self._speech_featurizer.vocab_size - - @property - def vocab_list(self): - """Return the vocabulary in list. - - :return: Vocabulary in list. - :rtype: list - """ - return self._speech_featurizer.vocab_list - - def _parse_tar(self, file): - """Parse a tar file to get a tarfile object - and a map containing tarinfoes - """ - result = {} - f = tarfile.open(file) - for tarinfo in f.getmembers(): - result[tarinfo.name] = tarinfo - return f, result - - def _subfile_from_tar(self, file): - """Get subfile object from tar. - - It will return a subfile object from tar file - and cached tar file info for next reading request. - """ - tarpath, filename = file.split(':', 1)[1].split('#', 1) - if 'tar2info' not in self._local_data.__dict__: - self._local_data.tar2info = {} - if 'tar2object' not in self._local_data.__dict__: - self._local_data.tar2object = {} - if tarpath not in self._local_data.tar2info: - object, infoes = self._parse_tar(tarpath) - self._local_data.tar2info[tarpath] = infoes - self._local_data.tar2object[tarpath] = object - return self._local_data.tar2object[tarpath].extractfile( - self._local_data.tar2info[tarpath][filename]) - - def _instance_reader_creator(self, manifest): - """ - Instance reader creator. Create a callable function to produce - instances of data. - - Instance: a tuple of ndarray of audio spectrogram and a list of - token indices for transcript. - """ - - def reader(): - for instance in manifest: - inst = self.process_utterance(instance["audio_filepath"], - instance["text"]) - yield inst - - return reader - - def _padding_batch(self, batch, padding_to=-1, flatten=False): - """ - Padding audio features with zeros to make them have the same shape (or - a user-defined shape) within one bach. - - If ``padding_to`` is -1, the maximun shape in the batch will be used - as the target shape for padding. 
Otherwise, `padding_to` will be the - target shape (only refers to the second axis). - - If `flatten` is True, features will be flatten to 1darray. - """ - new_batch = [] - # get target shape - max_length = max([audio.shape[1] for audio, text in batch]) - if padding_to != -1: - if padding_to < max_length: - raise ValueError("If padding_to is not -1, it should be larger " - "than any instance's shape in the batch") - max_length = padding_to - # padding - padded_audios = [] - texts, text_lens = [], [] - audio_lens = [] - masks = [] - for audio, text in batch: - padded_audio = np.zeros([audio.shape[0], max_length]) - padded_audio[:, :audio.shape[1]] = audio - if flatten: - padded_audio = padded_audio.flatten() - padded_audios.append(padded_audio) - if self._is_training: - texts += text - else: - texts.append(text) - text_lens.append(len(text)) - audio_lens.append(audio.shape[1]) - mask_shape0 = (audio.shape[0] - 1) // 2 + 1 - mask_shape1 = (audio.shape[1] - 1) // 3 + 1 - mask_max_len = (max_length - 1) // 3 + 1 - mask_ones = np.ones((mask_shape0, mask_shape1)) - mask_zeros = np.zeros((mask_shape0, mask_max_len - mask_shape1)) - mask = np.repeat( - np.reshape( - np.concatenate((mask_ones, mask_zeros), axis=1), - (1, mask_shape0, mask_max_len)), - 32, - axis=0) - masks.append(mask) - padded_audios = np.array(padded_audios).astype('float32') - if self._is_training: - texts = np.expand_dims(np.array(texts).astype('int32'), axis=-1) - texts = fluid.create_lod_tensor( - texts, recursive_seq_lens=[text_lens], place=self._place) - audio_lens = np.array(audio_lens).astype('int64').reshape([-1, 1]) - masks = np.array(masks).astype('float32') - return padded_audios, texts, audio_lens, masks - - def _batch_shuffle(self, manifest, batch_size, clipped=False): - """Put similarly-sized instances into minibatches for better efficiency - and make a batch-wise shuffle. - - 1. Sort the audio clips by duration. - 2. Generate a random number `k`, k in [0, batch_size). - 3. Randomly shift `k` instances in order to create different batches - for different epochs. Create minibatches. - 4. Shuffle the minibatches. - - :param manifest: Manifest contents. List of dict. - :type manifest: list - :param batch_size: Batch size. This size is also used for generate - a random number for batch shuffle. - :type batch_size: int - :param clipped: Whether to clip the heading (small shift) and trailing - (incomplete batch) instances. - :type clipped: bool - :return: Batch shuffled mainifest. - :rtype: list - """ - manifest.sort(key=lambda x: x["duration"]) - shift_len = self._rng.randint(0, batch_size - 1) - batch_manifest = list(zip(*[iter(manifest[shift_len:])] * batch_size)) - self._rng.shuffle(batch_manifest) - batch_manifest = [item for batch in batch_manifest for item in batch] - if not clipped: - res_len = len(manifest) - shift_len - len(batch_manifest) - batch_manifest.extend(manifest[-res_len:]) - batch_manifest.extend(manifest[0:shift_len]) - return batch_manifest diff --git a/decoders/swig/ctc_greedy_decoder.h b/decoders/swig/ctc_greedy_decoder.h deleted file mode 100644 index 5e64f692e..000000000 --- a/decoders/swig/ctc_greedy_decoder.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef CTC_GREEDY_DECODER_H -#define CTC_GREEDY_DECODER_H - -#include -#include - -/* CTC Greedy (Best Path) Decoder - * - * Parameters: - * probs_seq: 2-D vector that each element is a vector of probabilities - * over vocabulary of one time step. - * vocabulary: A vector of vocabulary. 
- * Return: - * The decoding result in string - */ -std::string ctc_greedy_decoder( - const std::vector>& probs_seq, - const std::vector& vocabulary); - -#endif // CTC_GREEDY_DECODER_H diff --git a/data_utils/__init__.py b/deepspeech/__init__.py similarity index 100% rename from data_utils/__init__.py rename to deepspeech/__init__.py diff --git a/data_utils/augmentor/__init__.py b/deepspeech/decoders/__init__.py similarity index 100% rename from data_utils/augmentor/__init__.py rename to deepspeech/decoders/__init__.py diff --git a/decoders/decoders_deprecated.py b/deepspeech/decoders/decoders_deprecated.py similarity index 100% rename from decoders/decoders_deprecated.py rename to deepspeech/decoders/decoders_deprecated.py diff --git a/decoders/scorer_deprecated.py b/deepspeech/decoders/scorer_deprecated.py similarity index 100% rename from decoders/scorer_deprecated.py rename to deepspeech/decoders/scorer_deprecated.py diff --git a/deepspeech/decoders/swig/.gitignore b/deepspeech/decoders/swig/.gitignore new file mode 100644 index 000000000..0b1046ae8 --- /dev/null +++ b/deepspeech/decoders/swig/.gitignore @@ -0,0 +1,9 @@ +ThreadPool/ +build/ +dist/ +kenlm/ +openfst-1.6.3/ +openfst-1.6.3.tar.gz +swig_decoders.egg-info/ +decoders_wrap.cxx +swig_decoders.py diff --git a/data_utils/featurizer/__init__.py b/deepspeech/decoders/swig/__init__.py similarity index 100% rename from data_utils/featurizer/__init__.py rename to deepspeech/decoders/swig/__init__.py diff --git a/decoders/swig/ctc_beam_search_decoder.cpp b/deepspeech/decoders/swig/ctc_beam_search_decoder.cpp similarity index 92% rename from decoders/swig/ctc_beam_search_decoder.cpp rename to deepspeech/decoders/swig/ctc_beam_search_decoder.cpp index 4a63af26a..68aec2090 100644 --- a/decoders/swig/ctc_beam_search_decoder.cpp +++ b/deepspeech/decoders/swig/ctc_beam_search_decoder.cpp @@ -1,3 +1,17 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + #include "ctc_beam_search_decoder.h" #include diff --git a/decoders/swig/ctc_beam_search_decoder.h b/deepspeech/decoders/swig/ctc_beam_search_decoder.h similarity index 78% rename from decoders/swig/ctc_beam_search_decoder.h rename to deepspeech/decoders/swig/ctc_beam_search_decoder.h index 6fdd15517..c31510da3 100644 --- a/decoders/swig/ctc_beam_search_decoder.h +++ b/deepspeech/decoders/swig/ctc_beam_search_decoder.h @@ -1,3 +1,17 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + #ifndef CTC_BEAM_SEARCH_DECODER_H_ #define CTC_BEAM_SEARCH_DECODER_H_ diff --git a/decoders/swig/ctc_greedy_decoder.cpp b/deepspeech/decoders/swig/ctc_greedy_decoder.cpp similarity index 68% rename from decoders/swig/ctc_greedy_decoder.cpp rename to deepspeech/decoders/swig/ctc_greedy_decoder.cpp index 03449d739..da028bf83 100644 --- a/decoders/swig/ctc_greedy_decoder.cpp +++ b/deepspeech/decoders/swig/ctc_greedy_decoder.cpp @@ -1,3 +1,17 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + #include "ctc_greedy_decoder.h" #include "decoder_utils.h" diff --git a/deepspeech/decoders/swig/ctc_greedy_decoder.h b/deepspeech/decoders/swig/ctc_greedy_decoder.h new file mode 100644 index 000000000..5e8c5c251 --- /dev/null +++ b/deepspeech/decoders/swig/ctc_greedy_decoder.h @@ -0,0 +1,34 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef CTC_GREEDY_DECODER_H +#define CTC_GREEDY_DECODER_H + +#include +#include + +/* CTC Greedy (Best Path) Decoder + * + * Parameters: + * probs_seq: 2-D vector that each element is a vector of probabilities + * over vocabulary of one time step. + * vocabulary: A vector of vocabulary. + * Return: + * The decoding result in string + */ +std::string ctc_greedy_decoder( + const std::vector>& probs_seq, + const std::vector& vocabulary); + +#endif // CTC_GREEDY_DECODER_H diff --git a/decoders/swig/decoder_utils.cpp b/deepspeech/decoders/swig/decoder_utils.cpp similarity index 88% rename from decoders/swig/decoder_utils.cpp rename to deepspeech/decoders/swig/decoder_utils.cpp index 70a159288..a10e07f0c 100644 --- a/decoders/swig/decoder_utils.cpp +++ b/deepspeech/decoders/swig/decoder_utils.cpp @@ -1,3 +1,17 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + #include "decoder_utils.h" #include diff --git a/decoders/swig/decoder_utils.h b/deepspeech/decoders/swig/decoder_utils.h similarity index 83% rename from decoders/swig/decoder_utils.h rename to deepspeech/decoders/swig/decoder_utils.h index 72821c187..827258178 100644 --- a/decoders/swig/decoder_utils.h +++ b/deepspeech/decoders/swig/decoder_utils.h @@ -1,3 +1,17 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + #ifndef DECODER_UTILS_H_ #define DECODER_UTILS_H_ diff --git a/decoders/swig/decoders.i b/deepspeech/decoders/swig/decoders.i similarity index 100% rename from decoders/swig/decoders.i rename to deepspeech/decoders/swig/decoders.i diff --git a/decoders/swig/path_trie.cpp b/deepspeech/decoders/swig/path_trie.cpp similarity index 86% rename from decoders/swig/path_trie.cpp rename to deepspeech/decoders/swig/path_trie.cpp index 152efa82c..392e7ca71 100644 --- a/decoders/swig/path_trie.cpp +++ b/deepspeech/decoders/swig/path_trie.cpp @@ -1,3 +1,17 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + #include "path_trie.h" #include diff --git a/decoders/swig/path_trie.h b/deepspeech/decoders/swig/path_trie.h similarity index 72% rename from decoders/swig/path_trie.h rename to deepspeech/decoders/swig/path_trie.h index 7fd715d26..3a5b71b7e 100644 --- a/decoders/swig/path_trie.h +++ b/deepspeech/decoders/swig/path_trie.h @@ -1,3 +1,17 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ #ifndef PATH_TRIE_H #define PATH_TRIE_H diff --git a/decoders/swig/scorer.cpp b/deepspeech/decoders/swig/scorer.cpp similarity index 91% rename from decoders/swig/scorer.cpp rename to deepspeech/decoders/swig/scorer.cpp index 27b61cd03..497a289c2 100644 --- a/decoders/swig/scorer.cpp +++ b/deepspeech/decoders/swig/scorer.cpp @@ -1,3 +1,17 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + #include "scorer.h" #include diff --git a/decoders/swig/scorer.h b/deepspeech/decoders/swig/scorer.h similarity index 83% rename from decoders/swig/scorer.h rename to deepspeech/decoders/swig/scorer.h index 5ebc719c7..66c4cb123 100644 --- a/decoders/swig/scorer.h +++ b/deepspeech/decoders/swig/scorer.h @@ -1,3 +1,17 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ #ifndef SCORER_H_ #define SCORER_H_ diff --git a/decoders/swig/setup.py b/deepspeech/decoders/swig/setup.py similarity index 96% rename from decoders/swig/setup.py rename to deepspeech/decoders/swig/setup.py index 0fcb24b50..f6dc048da 100644 --- a/decoders/swig/setup.py +++ b/deepspeech/decoders/swig/setup.py @@ -81,9 +81,8 @@ FILES = glob.glob('kenlm/util/*.cc') \ FILES += glob.glob('openfst-1.6.3/src/lib/*.cc') FILES = [ - fn for fn in FILES - if not (fn.endswith('main.cc') or fn.endswith('test.cc') or fn.endswith( - 'unittest.cc')) + fn for fn in FILES if not (fn.endswith('main.cc') or fn.endswith('test.cc') + or fn.endswith('unittest.cc')) ] LIBS = ['stdc++'] diff --git a/decoders/swig/setup.sh b/deepspeech/decoders/swig/setup.sh similarity index 100% rename from decoders/swig/setup.sh rename to deepspeech/decoders/swig/setup.sh diff --git a/decoders/swig_wrapper.py b/deepspeech/decoders/swig_wrapper.py similarity index 99% rename from decoders/swig_wrapper.py rename to deepspeech/decoders/swig_wrapper.py index 0a0579ad0..3051f4e82 100644 --- a/decoders/swig_wrapper.py +++ b/deepspeech/decoders/swig_wrapper.py @@ -46,7 +46,7 @@ def ctc_greedy_decoder(probs_seq, vocabulary): :rtype: str """ result = swig_decoders.ctc_greedy_decoder(probs_seq.tolist(), vocabulary) - return result.decode('utf-8') + return result def ctc_beam_search_decoder(probs_seq, diff --git a/decoders/tests/test_decoders.py b/deepspeech/decoders/tests/test_decoders.py similarity index 98% rename from decoders/tests/test_decoders.py rename to deepspeech/decoders/tests/test_decoders.py index 9c4b1c8eb..e225d2bc1 100644 --- a/decoders/tests/test_decoders.py +++ b/deepspeech/decoders/tests/test_decoders.py @@ -14,7 +14,7 @@ """Test decoders.""" import unittest -from decoders import decoders_deprecated as decoder +from deepspeech.decoders import decoders_deprecated as decoder class TestDecoders(unittest.TestCase): diff --git a/decoders/__init__.py b/deepspeech/exps/__init__.py similarity index 100% rename from decoders/__init__.py rename to deepspeech/exps/__init__.py diff --git a/decoders/swig/__init__.py b/deepspeech/exps/deepspeech2/__init__.py similarity index 100% rename from decoders/swig/__init__.py rename to deepspeech/exps/deepspeech2/__init__.py diff --git a/deploy/demo_client.py b/deepspeech/exps/deepspeech2/bin/deploy/client.py similarity index 83% rename from deploy/demo_client.py rename to deepspeech/exps/deepspeech2/bin/deploy/client.py index b4aa50e8e..766fdc5a9 100644 --- a/deploy/demo_client.py +++ b/deepspeech/exps/deepspeech2/bin/deploy/client.py @@ -19,6 +19,8 @@ import sys import argparse import pyaudio +from deepspeech.utils.socket_server import socket_send + parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( "--host_ip", @@ -61,16 +63,7 @@ def callback(in_data, frame_count, time_info, status): data_list.append(in_data) enable_trigger_record = False elif len(data_list) > 0: - # Connect to server and send data - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.connect((args.host_ip, args.host_port)) - sent = ''.join(data_list) - sock.sendall(struct.pack('>i', len(sent)) + sent) - print('Speech[length=%d] Sent.' 
% len(sent)) - # Receive data from the server and shut down - received = sock.recv(1024) - print("Recognition Results: {}".format(received)) - sock.close() + socket_send(args.host_ip, args.host_port, ''.join(data_list)) data_list = [] enable_trigger_record = True return (in_data, pyaudio.paContinue) @@ -80,7 +73,7 @@ def main(): # prepare audio recorder p = pyaudio.PyAudio() stream = p.open( - format=pyaudio.paInt32, + format=pyaudio.paInt16, channels=1, rate=16000, input=True, diff --git a/deepspeech/exps/deepspeech2/bin/deploy/record.py b/deepspeech/exps/deepspeech2/bin/deploy/record.py new file mode 100644 index 000000000..717747593 --- /dev/null +++ b/deepspeech/exps/deepspeech2/bin/deploy/record.py @@ -0,0 +1,54 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Record wav from Microphone""" +# http://people.csail.mit.edu/hubert/pyaudio/ +import pyaudio +import wave + +CHUNK = 1024 +FORMAT = pyaudio.paInt16 +CHANNELS = 1 +RATE = 16000 +RECORD_SECONDS = 5 +WAVE_OUTPUT_FILENAME = "output.wav" + +p = pyaudio.PyAudio() + +stream = p.open( + format=FORMAT, + channels=CHANNELS, + rate=RATE, + input=True, + frames_per_buffer=CHUNK) + +print("* recording") + +frames = [] + +for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)): + data = stream.read(CHUNK) + frames.append(data) + +print("* done recording") + +stream.stop_stream() +stream.close() +p.terminate() + +wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb') +wf.setnchannels(CHANNELS) +wf.setsampwidth(p.get_sample_size(FORMAT)) +wf.setframerate(RATE) +wf.writeframes(b''.join(frames)) +wf.close() diff --git a/deepspeech/exps/deepspeech2/bin/deploy/runtime.py b/deepspeech/exps/deepspeech2/bin/deploy/runtime.py new file mode 100644 index 000000000..22dc9ad57 --- /dev/null +++ b/deepspeech/exps/deepspeech2/bin/deploy/runtime.py @@ -0,0 +1,207 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Server-end for the ASR demo.""" +import os +import time +import argparse +import functools +import paddle +import numpy as np + +from deepspeech.utils.socket_server import warm_up_test +from deepspeech.utils.socket_server import AsrTCPServer +from deepspeech.utils.socket_server import AsrRequestHandler + +from deepspeech.training.cli import default_argument_parser +from deepspeech.exps.deepspeech2.config import get_cfg_defaults + +from deepspeech.frontend.utility import read_manifest +from deepspeech.utils.utility import add_arguments, print_arguments + +from deepspeech.models.deepspeech2 import DeepSpeech2Model +from deepspeech.io.dataset import ManifestDataset + +from paddle.inference import Config +from paddle.inference import create_predictor + + +def init_predictor(args): + if args.model_dir is not None: + config = Config(args.model_dir) + else: + config = Config(args.model_file, args.params_file) + + config.enable_memory_optim() + if args.use_gpu: + config.enable_use_gpu(memory_pool_init_size_mb=1000, device_id=0) + else: + # If not specific mkldnn, you can set the blas thread. + # The thread num should not be greater than the number of cores in the CPU. + config.set_cpu_math_library_num_threads(4) + config.enable_mkldnn() + + predictor = create_predictor(config) + return predictor + + +def run(predictor, img): + # copy img data to input tensor + input_names = predictor.get_input_names() + for i, name in enumerate(input_names): + input_tensor = predictor.get_input_handle(name) + #input_tensor.reshape(img[i].shape) + #input_tensor.copy_from_cpu(img[i].copy()) + + # do the inference + predictor.run() + + results = [] + # get out data from output tensor + output_names = predictor.get_output_names() + for i, name in enumerate(output_names): + output_tensor = predictor.get_output_handle(name) + output_data = output_tensor.copy_to_cpu() + results.append(output_data) + + return results + + +def inference(config, args): + predictor = init_predictor(args) + + +def start_server(config, args): + """Start the ASR server""" + dataset = ManifestDataset( + config.data.test_manifest, + config.data.vocab_filepath, + config.data.mean_std_filepath, + augmentation_config="{}", + max_duration=config.data.max_duration, + min_duration=config.data.min_duration, + stride_ms=config.data.stride_ms, + window_ms=config.data.window_ms, + n_fft=config.data.n_fft, + max_freq=config.data.max_freq, + target_sample_rate=config.data.target_sample_rate, + specgram_type=config.data.specgram_type, + use_dB_normalization=config.data.use_dB_normalization, + target_dB=config.data.target_dB, + random_seed=config.data.random_seed, + keep_transcription_text=True) + + model = DeepSpeech2Model.from_pretrained(dataset, config, + args.checkpoint_path) + model.eval() + + # prepare ASR inference handler + def file_to_transcript(filename): + feature = dataset.process_utterance(filename, "") + audio = np.array([feature[0]]).astype('float32') #[1, D, T] + audio_len = feature[0].shape[1] + audio_len = np.array([audio_len]).astype('int64') # [1] + + result_transcript = model.decode( + paddle.to_tensor(audio), + paddle.to_tensor(audio_len), + vocab_list=dataset.vocab_list, + decoding_method=config.decoding.decoding_method, + lang_model_path=config.decoding.lang_model_path, + beam_alpha=config.decoding.alpha, + beam_beta=config.decoding.beta, + beam_size=config.decoding.beam_size, + cutoff_prob=config.decoding.cutoff_prob, + cutoff_top_n=config.decoding.cutoff_top_n, + num_processes=config.decoding.num_proc_bsearch) + return 
result_transcript[0] + + # warming up with utterrances sampled from Librispeech + print('-----------------------------------------------------------') + print('Warming up ...') + warm_up_test( + audio_process_handler=file_to_transcript, + manifest_path=args.warmup_manifest, + num_test_cases=3) + print('-----------------------------------------------------------') + + # start the server + server = AsrTCPServer( + server_address=(args.host_ip, args.host_port), + RequestHandlerClass=AsrRequestHandler, + speech_save_dir=args.speech_save_dir, + audio_process_handler=file_to_transcript) + print("ASR Server Started.") + server.serve_forever() + + +def main(config, args): + start_server(config, args) + + +if __name__ == "__main__": + parser = default_argument_parser() + add_arg = functools.partial(add_arguments, argparser=parser) + # yapf: disable + add_arg('host_ip', str, + 'localhost', + "Server's IP address.") + add_arg('host_port', int, 8086, "Server's IP port.") + add_arg('speech_save_dir', str, + 'demo_cache', + "Directory to save demo audios.") + add_arg('warmup_manifest', str, None, "Filepath of manifest to warm up.") + add_arg( + "--model_file", + type=str, + default="", + help="Model filename, Specify this when your model is a combined model." + ) + add_arg( + "--params_file", + type=str, + default="", + help= + "Parameter filename, Specify this when your model is a combined model." + ) + add_arg( + "--model_dir", + type=str, + default=None, + help= + "Model dir, If you load a non-combined model, specify the directory of the model." + ) + add_arg("--use_gpu", + type=bool, + default=False, + help="Whether use gpu.") + args = parser.parse_args() + print_arguments(args) + + # https://yaml.org/type/float.html + config = get_cfg_defaults() + if args.config: + config.merge_from_file(args.config) + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + + args.warmup_manifest = config.data.test_manifest + print_arguments(args) + + if args.dump_config: + with open(args.dump_config, 'w') as f: + print(config, file=f) + + main(config, args) diff --git a/deepspeech/exps/deepspeech2/bin/deploy/send.py b/deepspeech/exps/deepspeech2/bin/deploy/send.py new file mode 100644 index 000000000..84411f91f --- /dev/null +++ b/deepspeech/exps/deepspeech2/bin/deploy/send.py @@ -0,0 +1,52 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Socket client to send wav to ASR server.""" +import struct +import socket +import argparse +import wave + +from deepspeech.utils.socket_server import socket_send + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--host_ip", + default="localhost", + type=str, + help="Server IP address. (default: %(default)s)") +parser.add_argument( + "--host_port", + default=8086, + type=int, + help="Server Port. 
(default: %(default)s)") +args = parser.parse_args() + +WAVE_OUTPUT_FILENAME = "output.wav" + + +def main(): + wf = wave.open(WAVE_OUTPUT_FILENAME, 'rb') + nframe = wf.getnframes() + data = wf.readframes(nframe) + print(f"Wave: {WAVE_OUTPUT_FILENAME}") + print(f"Wave samples: {nframe}") + print(f"Wave channels: {wf.getnchannels()}") + print(f"Wave sample rate: {wf.getframerate()}") + print(f"Wave sample width: {wf.getsampwidth()}") + assert isinstance(data, bytes) + socket_send(args.host_ip, args.host_port, data) + + +if __name__ == "__main__": + main() diff --git a/deepspeech/exps/deepspeech2/bin/deploy/server.py b/deepspeech/exps/deepspeech2/bin/deploy/server.py new file mode 100644 index 000000000..6b99adc3f --- /dev/null +++ b/deepspeech/exps/deepspeech2/bin/deploy/server.py @@ -0,0 +1,134 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Server-end for the ASR demo.""" +import os +import time +import argparse +import functools +import paddle +import numpy as np + +from deepspeech.utils.socket_server import warm_up_test +from deepspeech.utils.socket_server import AsrTCPServer +from deepspeech.utils.socket_server import AsrRequestHandler + +from deepspeech.training.cli import default_argument_parser +from deepspeech.exps.deepspeech2.config import get_cfg_defaults + +from deepspeech.frontend.utility import read_manifest +from deepspeech.utils.utility import add_arguments, print_arguments + +from deepspeech.models.deepspeech2 import DeepSpeech2Model +from deepspeech.io.dataset import ManifestDataset + + +def start_server(config, args): + """Start the ASR server""" + dataset = ManifestDataset( + config.data.test_manifest, + config.data.vocab_filepath, + config.data.mean_std_filepath, + augmentation_config="{}", + max_duration=config.data.max_duration, + min_duration=config.data.min_duration, + stride_ms=config.data.stride_ms, + window_ms=config.data.window_ms, + n_fft=config.data.n_fft, + max_freq=config.data.max_freq, + target_sample_rate=config.data.target_sample_rate, + specgram_type=config.data.specgram_type, + use_dB_normalization=config.data.use_dB_normalization, + target_dB=config.data.target_dB, + random_seed=config.data.random_seed, + keep_transcription_text=True) + model = DeepSpeech2Model.from_pretrained(dataset, config, + args.checkpoint_path) + model.eval() + + # prepare ASR inference handler + def file_to_transcript(filename): + feature = dataset.process_utterance(filename, "") + audio = np.array([feature[0]]).astype('float32') #[1, D, T] + audio_len = feature[0].shape[1] + audio_len = np.array([audio_len]).astype('int64') # [1] + + result_transcript = model.decode( + paddle.to_tensor(audio), + paddle.to_tensor(audio_len), + vocab_list=dataset.vocab_list, + decoding_method=config.decoding.decoding_method, + lang_model_path=config.decoding.lang_model_path, + beam_alpha=config.decoding.alpha, + beam_beta=config.decoding.beta, + beam_size=config.decoding.beam_size, + cutoff_prob=config.decoding.cutoff_prob, + 
cutoff_top_n=config.decoding.cutoff_top_n, + num_processes=config.decoding.num_proc_bsearch) + return result_transcript[0] + + # warming up with utterrances sampled from Librispeech + print('-----------------------------------------------------------') + print('Warming up ...') + warm_up_test( + audio_process_handler=file_to_transcript, + manifest_path=args.warmup_manifest, + num_test_cases=3) + print('-----------------------------------------------------------') + + # start the server + server = AsrTCPServer( + server_address=(args.host_ip, args.host_port), + RequestHandlerClass=AsrRequestHandler, + speech_save_dir=args.speech_save_dir, + audio_process_handler=file_to_transcript) + print("ASR Server Started.") + server.serve_forever() + + +def main(config, args): + start_server(config, args) + + +if __name__ == "__main__": + parser = default_argument_parser() + add_arg = functools.partial(add_arguments, argparser=parser) + # yapf: disable + add_arg('host_ip', str, + 'localhost', + "Server's IP address.") + add_arg('host_port', int, 8086, "Server's IP port.") + add_arg('speech_save_dir', str, + 'demo_cache', + "Directory to save demo audios.") + add_arg('warmup_manifest', str, None, "Filepath of manifest to warm up.") + args = parser.parse_args() + print_arguments(args) + + # https://yaml.org/type/float.html + config = get_cfg_defaults() + if args.config: + config.merge_from_file(args.config) + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + + args.warmup_manifest = config.data.test_manifest + print_arguments(args) + + if args.dump_config: + with open(args.dump_config, 'w') as f: + print(config, file=f) + + main(config, args) diff --git a/deepspeech/exps/deepspeech2/bin/export.py b/deepspeech/exps/deepspeech2/bin/export.py new file mode 100644 index 000000000..f19060ef0 --- /dev/null +++ b/deepspeech/exps/deepspeech2/bin/export.py @@ -0,0 +1,58 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
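In runtime.py above, run() still has its input-feeding lines commented out. Assuming the exported model takes the two inputs declared by export() later in this diff (audio as float32 [B, D, T], audio_len as int64 [B]), the feeding step would typically look like the sketch below, built on the paddle.inference handle API; the parameter name `inputs` and the input ordering are assumptions to check against get_input_names().

def run(predictor, inputs):
    """Feed a list of numpy arrays into a paddle.inference predictor and fetch outputs.

    `inputs` is assumed to be ordered like predictor.get_input_names(), e.g.
    [audio, audio_len] for the exported DeepSpeech2 model.
    """
    for i, name in enumerate(predictor.get_input_names()):
        input_tensor = predictor.get_input_handle(name)
        input_tensor.reshape(inputs[i].shape)          # the two lines left
        input_tensor.copy_from_cpu(inputs[i].copy())   # commented out above

    predictor.run()

    results = []
    for name in predictor.get_output_names():
        output_tensor = predictor.get_output_handle(name)
        results.append(output_tensor.copy_to_cpu())
    return results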
+"""Export for DeepSpeech2 model.""" + +import io +import logging +import argparse +import functools + +from paddle import distributed as dist + +from deepspeech.training.cli import default_argument_parser +from deepspeech.utils.utility import print_arguments +from deepspeech.utils.error_rate import char_errors, word_errors + +from deepspeech.exps.deepspeech2.config import get_cfg_defaults +from deepspeech.exps.deepspeech2.model import DeepSpeech2Tester as Tester + + +def main_sp(config, args): + exp = Tester(config, args) + exp.setup() + exp.run_export() + + +def main(config, args): + main_sp(config, args) + + +if __name__ == "__main__": + parser = default_argument_parser() + args = parser.parse_args() + print_arguments(args) + + # https://yaml.org/type/float.html + config = get_cfg_defaults() + if args.config: + config.merge_from_file(args.config) + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + if args.dump_config: + with open(args.dump_config, 'w') as f: + print(config, file=f) + + main(config, args) diff --git a/deepspeech/exps/deepspeech2/bin/infer.py b/deepspeech/exps/deepspeech2/bin/infer.py new file mode 100644 index 000000000..6f52c812f --- /dev/null +++ b/deepspeech/exps/deepspeech2/bin/infer.py @@ -0,0 +1,59 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inferer for DeepSpeech2 model.""" + +import io +import logging +import argparse +import functools + +from paddle import distributed as dist + +from deepspeech.training.cli import default_argument_parser +from deepspeech.utils.utility import print_arguments +from deepspeech.utils.error_rate import char_errors, word_errors + +# TODO(hui zhang): dynamic load +from deepspeech.exps.deepspeech2.config import get_cfg_defaults +from deepspeech.exps.deepspeech2.model import DeepSpeech2Tester as Tester + + +def main_sp(config, args): + exp = Tester(config, args) + exp.setup() + exp.run_test() + + +def main(config, args): + main_sp(config, args) + + +if __name__ == "__main__": + parser = default_argument_parser() + args = parser.parse_args() + print_arguments(args) + + # https://yaml.org/type/float.html + config = get_cfg_defaults() + if args.config: + config.merge_from_file(args.config) + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + if args.dump_config: + with open(args.dump_config, 'w') as f: + print(config, file=f) + + main(config, args) diff --git a/deepspeech/exps/deepspeech2/bin/test.py b/deepspeech/exps/deepspeech2/bin/test.py new file mode 100644 index 000000000..72b38f485 --- /dev/null +++ b/deepspeech/exps/deepspeech2/bin/test.py @@ -0,0 +1,58 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Evaluation for DeepSpeech2 model.""" + +import io +import logging +import argparse +import functools + +from paddle import distributed as dist + +from deepspeech.training.cli import default_argument_parser +from deepspeech.utils.utility import print_arguments +from deepspeech.utils.error_rate import char_errors, word_errors + +from deepspeech.exps.deepspeech2.config import get_cfg_defaults +from deepspeech.exps.deepspeech2.model import DeepSpeech2Tester as Tester + + +def main_sp(config, args): + exp = Tester(config, args) + exp.setup() + exp.run_test() + + +def main(config, args): + main_sp(config, args) + + +if __name__ == "__main__": + parser = default_argument_parser() + args = parser.parse_args() + print_arguments(args) + + # https://yaml.org/type/float.html + config = get_cfg_defaults() + if args.config: + config.merge_from_file(args.config) + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + if args.dump_config: + with open(args.dump_config, 'w') as f: + print(config, file=f) + + main(config, args) diff --git a/deepspeech/exps/deepspeech2/bin/train.py b/deepspeech/exps/deepspeech2/bin/train.py new file mode 100644 index 000000000..0c1d08914 --- /dev/null +++ b/deepspeech/exps/deepspeech2/bin/train.py @@ -0,0 +1,60 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
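Every entry point in this diff builds on default_argument_parser() from deepspeech.training.cli, which is not shown here. Judging only from the attributes the scripts access (args.config, args.opts, args.dump_config, args.device, args.nprocs, args.checkpoint_path, args.export_path, args.output), a hypothetical stand-in would look roughly as follows; the real parser's flag names, defaults, and help strings may differ.

import argparse


def default_argument_parser():
    # Hypothetical reconstruction: only the attribute names are known from the
    # scripts in this diff; the flags, types, and defaults here are assumptions.
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, default=None, help="yaml config file")
    parser.add_argument("--opts", nargs=argparse.REMAINDER, default=[],
                        help="extra KEY VALUE pairs merged into the config")
    parser.add_argument("--dump_config", type=str, default=None,
                        help="write the resolved config to this path")
    parser.add_argument("--device", type=str, default="gpu", choices=["cpu", "gpu"])
    parser.add_argument("--nprocs", type=int, default=1, help="number of worker processes")
    parser.add_argument("--checkpoint_path", type=str, default=None)
    parser.add_argument("--export_path", type=str, default=None)
    parser.add_argument("--output", type=str, default=None, help="experiment output directory")
    return parser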
+"""Trainer for DeepSpeech2 model.""" + +import io +import logging +import argparse +import functools + +from paddle import distributed as dist + +from deepspeech.utils.utility import print_arguments +from deepspeech.training.cli import default_argument_parser + +from deepspeech.exps.deepspeech2.config import get_cfg_defaults +from deepspeech.exps.deepspeech2.model import DeepSpeech2Trainer as Trainer + + +def main_sp(config, args): + exp = Trainer(config, args) + exp.setup() + exp.run() + + +def main(config, args): + if args.device == "gpu" and args.nprocs > 1: + dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) + else: + main_sp(config, args) + + +if __name__ == "__main__": + parser = default_argument_parser() + args = parser.parse_args() + print_arguments(args) + + # https://yaml.org/type/float.html + config = get_cfg_defaults() + if args.config: + config.merge_from_file(args.config) + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + if args.dump_config: + with open(args.dump_config, 'w') as f: + print(config, file=f) + + main(config, args) diff --git a/deepspeech/exps/deepspeech2/bin/tune.py b/deepspeech/exps/deepspeech2/bin/tune.py new file mode 100644 index 000000000..1fc8dc0c1 --- /dev/null +++ b/deepspeech/exps/deepspeech2/bin/tune.py @@ -0,0 +1,210 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Beam search parameters tuning for DeepSpeech2 model.""" + +import sys +import os +import numpy as np +import argparse +import functools +import gzip +import logging + +from paddle.io import DataLoader + +from deepspeech.utils import error_rate +from deepspeech.utils.utility import add_arguments, print_arguments + +from deepspeech.models.deepspeech2 import DeepSpeech2Model +from deepspeech.io.collator import SpeechCollator +from deepspeech.io.dataset import ManifestDataset + +from deepspeech.training.cli import default_argument_parser +from deepspeech.exps.deepspeech2.config import get_cfg_defaults + + +def tune(config, args): + """Tune parameters alpha and beta incrementally.""" + if not args.num_alphas >= 0: + raise ValueError("num_alphas must be non-negative!") + if not args.num_betas >= 0: + raise ValueError("num_betas must be non-negative!") + + dev_dataset = ManifestDataset( + config.data.dev_manifest, + config.data.vocab_filepath, + config.data.mean_std_filepath, + augmentation_config="{}", + max_duration=config.data.max_duration, + min_duration=config.data.min_duration, + stride_ms=config.data.stride_ms, + window_ms=config.data.window_ms, + n_fft=config.data.n_fft, + max_freq=config.data.max_freq, + target_sample_rate=config.data.target_sample_rate, + specgram_type=config.data.specgram_type, + use_dB_normalization=config.data.use_dB_normalization, + target_dB=config.data.target_dB, + random_seed=config.data.random_seed, + keep_transcription_text=True) + + valid_loader = DataLoader( + dev_dataset, + batch_size=config.data.batch_size, + shuffle=False, + drop_last=False, + collate_fn=SpeechCollator(is_training=False)) + + model = DeepSpeech2Model.from_pretrained(dev_dataset, config, + args.checkpoint_path) + model.eval() + + # decoders only accept string encoded in utf-8 + vocab_list = valid_loader.dataset.vocab_list + errors_func = error_rate.char_errors if config.decoding.error_rate_type == 'cer' else error_rate.word_errors + + # create grid for search + cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas) + cand_betas = np.linspace(args.beta_from, args.beta_to, args.num_betas) + params_grid = [(alpha, beta) for alpha in cand_alphas + for beta in cand_betas] + + err_sum = [0.0 for i in range(len(params_grid))] + err_ave = [0.0 for i in range(len(params_grid))] + + num_ins, len_refs, cur_batch = 0, 0, 0 + # initialize external scorer + model.decoder.init_decode(args.alpha_from, args.beta_from, + config.decoding.lang_model_path, vocab_list, + config.decoding.decoding_method) + ## incremental tuning parameters over multiple batches + print("start tuning ...") + for infer_data in valid_loader(): + if (args.num_batches >= 0) and (cur_batch >= args.num_batches): + break + + def ordid2token(texts, texts_len): + """ ord() id to chr() chr """ + trans = [] + for text, n in zip(texts, texts_len): + n = n.numpy().item() + ids = text[:n] + trans.append(''.join([chr(i) for i in ids])) + return trans + + audio, text, audio_len, text_len = infer_data + target_transcripts = ordid2token(text, text_len) + num_ins += audio.shape[0] + + # model infer + eouts, eouts_len = model.encoder(audio, audio_len) + probs = model.decoder.probs(eouts) + + # grid search + for index, (alpha, beta) in enumerate(params_grid): + print(f"tuneing: alpha={alpha} beta={beta}") + result_transcripts = model.decoder.decode_probs( + probs.numpy(), eouts_len, vocab_list, + config.decoding.decoding_method, + config.decoding.lang_model_path, alpha, beta, + config.decoding.beam_size, 
config.decoding.cutoff_prob, + config.decoding.cutoff_top_n, config.decoding.num_proc_bsearch) + + for target, result in zip(target_transcripts, result_transcripts): + errors, len_ref = errors_func(target, result) + err_sum[index] += errors + + # accumulate the length of references of every batchπ + # in the first iteration + if args.alpha_from == alpha and args.beta_from == beta: + len_refs += len_ref + + err_ave[index] = err_sum[index] / len_refs + if index % 2 == 0: + sys.stdout.write('.') + sys.stdout.flush() + print(f"tuneing: one grid done!") + + # output on-line tuning result at the end of current batch + err_ave_min = min(err_ave) + min_index = err_ave.index(err_ave_min) + print("\nBatch %d [%d/?], current opt (alpha, beta) = (%s, %s), " + " min [%s] = %f" % + (cur_batch, num_ins, "%.3f" % params_grid[min_index][0], + "%.3f" % params_grid[min_index][1], + config.decoding.error_rate_type, err_ave_min)) + cur_batch += 1 + + # output WER/CER at every (alpha, beta) + print("\nFinal %s:\n" % config.decoding.error_rate_type) + for index in range(len(params_grid)): + print("(alpha, beta) = (%s, %s), [%s] = %f" % + ("%.3f" % params_grid[index][0], "%.3f" % params_grid[index][1], + config.decoding.error_rate_type, err_ave[index])) + + err_ave_min = min(err_ave) + min_index = err_ave.index(err_ave_min) + print("\nFinish tuning on %d batches, final opt (alpha, beta) = (%s, %s)" % + (cur_batch, "%.3f" % params_grid[min_index][0], + "%.3f" % params_grid[min_index][1])) + + print("finish tuning") + + +def main(config, args): + tune(config, args) + + +if __name__ == "__main__": + parser = default_argument_parser() + add_arg = functools.partial(add_arguments, argparser=parser) + add_arg('num_batches', int, -1, "# of batches tuning on. " + "Default -1, on whole dev set.") + add_arg('num_alphas', int, 45, "# of alpha candidates for tuning.") + add_arg('num_betas', int, 8, "# of beta candidates for tuning.") + add_arg('alpha_from', float, 1.0, "Where alpha starts tuning from.") + add_arg('alpha_to', float, 3.2, "Where alpha ends tuning with.") + add_arg('beta_from', float, 0.1, "Where beta starts tuning from.") + add_arg('beta_to', float, 0.45, "Where beta ends tuning with.") + + add_arg('batch_size', int, 256, "# of samples per batch.") + add_arg('beam_size', int, 500, "Beam search width.") + add_arg('num_proc_bsearch', int, 8, "# of CPUs for beam search.") + add_arg('cutoff_prob', float, 1.0, "Cutoff probability for pruning.") + add_arg('cutoff_top_n', int, 40, "Cutoff number for pruning.") + + args = parser.parse_args() + print_arguments(args) + + # https://yaml.org/type/float.html + config = get_cfg_defaults() + if args.config: + config.merge_from_file(args.config) + if args.opts: + config.merge_from_list(args.opts) + + config.data.batch_size = args.batch_size + config.decoding.beam_size = args.beam_size + config.decoding.num_proc_bsearch = args.num_proc_bsearch + config.decoding.cutoff_prob = args.cutoff_prob + config.decoding.cutoff_top_n = args.cutoff_top_n + + config.freeze() + print(config) + + if args.dump_config: + with open(args.dump_config, 'w') as f: + print(config, file=f) + + main(config, args) diff --git a/deepspeech/exps/deepspeech2/config.py b/deepspeech/exps/deepspeech2/config.py new file mode 100644 index 000000000..968899d75 --- /dev/null +++ b/deepspeech/exps/deepspeech2/config.py @@ -0,0 +1,84 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
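The grid search in tune.py above is easiest to see with concrete numbers: with the default arguments it evaluates every dev batch at 45 x 8 = 360 (alpha, beta) pairs, keeps a running average error per pair, and picks the pair with the smallest average. A minimal illustration, with placeholder error values standing in for the measured WER/CER:

import numpy as np

# Same grid construction as tune.py, using the script's default search ranges.
cand_alphas = np.linspace(1.0, 3.2, 45)    # alpha_from .. alpha_to
cand_betas = np.linspace(0.1, 0.45, 8)     # beta_from .. beta_to
params_grid = [(alpha, beta) for alpha in cand_alphas for beta in cand_betas]
assert len(params_grid) == 360             # each batch is decoded 360 times

# err_ave[i] = err_sum[i] / len_refs is the running WER/CER for candidate i;
# the best (alpha, beta) is simply the grid point with the minimum average.
err_ave = [0.5 + 0.001 * i for i in range(len(params_grid))]   # placeholder values
min_index = err_ave.index(min(err_ave))
best_alpha, best_beta = params_grid[min_index]
print("opt (alpha, beta) = (%.3f, %.3f)" % (best_alpha, best_beta))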
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from yacs.config import CfgNode as CN +from deepspeech.models.deepspeech2 import DeepSpeech2Model + +_C = CN() +_C.data = CN( + dict( + train_manifest="", + dev_manifest="", + test_manifest="", + vocab_filepath="", + mean_std_filepath="", + augmentation_config="", + max_duration=float('inf'), + min_duration=0.0, + stride_ms=10.0, # ms + window_ms=20.0, # ms + n_fft=None, # fft points + max_freq=None, # None for samplerate/2 + specgram_type='linear', # 'linear', 'mfcc' + target_sample_rate=16000, # sample rate + use_dB_normalization=True, + target_dB=-20, + random_seed=0, + keep_transcription_text=False, + batch_size=32, # batch size + num_workers=0, # data loader workers + sortagrad=False, # sorted in first epoch when True + shuffle_method="batch_shuffle", # 'batch_shuffle', 'instance_shuffle' + )) + +_C.model = CN( + dict( + num_conv_layers=2, #Number of stacking convolution layers. + num_rnn_layers=3, #Number of stacking RNN layers. + rnn_layer_size=1024, #RNN layer size (number of RNN cells). + use_gru=True, #Use gru if set True. Use simple rnn if set False. + share_rnn_weights=True #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported. + )) + +DeepSpeech2Model.params(_C.model) + +_C.training = CN( + dict( + lr=5e-4, # learning rate + lr_decay=1.0, # learning rate decay + weight_decay=1e-6, # the coeff of weight decay + global_grad_clip=5.0, # the global norm clip + n_epoch=50, # train epochs + )) + +_C.decoding = CN( + dict( + alpha=2.5, # Coef of LM for beam search. + beta=0.3, # Coef of WC for beam search. + cutoff_prob=1.0, # Cutoff probability for pruning. + cutoff_top_n=40, # Cutoff number for pruning. + lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model. + decoding_method='ctc_beam_search', # Decoding method. Options: ctc_beam_search, ctc_greedy + error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer' + num_proc_bsearch=8, # # of CPUs for beam search. + beam_size=500, # Beam search width. + batch_size=128, # decoding batch size + )) + + +def get_cfg_defaults(): + """Get a yacs CfgNode object with default values for my_project.""" + # Return a clone so that the defaults will not be altered + # This is for the "local variable" use pattern + return _C.clone() diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py new file mode 100644 index 000000000..09280cf9f --- /dev/null +++ b/deepspeech/exps/deepspeech2/model.py @@ -0,0 +1,424 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
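The defaults defined in config.py are consumed through the same clone/merge/freeze pattern in every bin/*.py entry point. A minimal sketch of that flow on a subset of the keys defined above (the override value is purely illustrative):

from yacs.config import CfgNode as CN

# Toy node using two of the keys from _C.training above.
_C = CN(dict(training=CN(dict(lr=5e-4, n_epoch=50))))

config = _C.clone()                            # what get_cfg_defaults() returns
config.merge_from_list(["training.lr", 1e-3])  # like passing `--opts training.lr 1e-3`
config.freeze()                                # no further mutation allowed
print(config.training.lr)                      # 0.001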
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains DeepSpeech2 model.""" + +import io +import sys +import os +import time +import logging +import numpy as np +from collections import defaultdict +from functools import partial +from pathlib import Path + +import paddle +from paddle import distributed as dist +from paddle.io import DataLoader + +from deepspeech.training import Trainer +from deepspeech.training.gradclip import MyClipGradByGlobalNorm + +from deepspeech.utils import mp_tools +from deepspeech.utils import layer_tools +from deepspeech.utils import error_rate + +from deepspeech.io.collator import SpeechCollator +from deepspeech.io.sampler import SortagradDistributedBatchSampler +from deepspeech.io.sampler import SortagradBatchSampler +from deepspeech.io.dataset import ManifestDataset + +from deepspeech.modules.loss import CTCLoss +from deepspeech.models.deepspeech2 import DeepSpeech2Model +from deepspeech.models.deepspeech2 import DeepSpeech2InferModel + +logger = logging.getLogger(__name__) + + +class DeepSpeech2Trainer(Trainer): + def __init__(self, config, args): + super().__init__(config, args) + + def train_batch(self, batch_data): + start = time.time() + self.model.train() + loss = self.model(*batch_data) + loss.backward() + layer_tools.print_grads(self.model, print_func=None) + self.optimizer.step() + self.optimizer.clear_grad() + + iteration_time = time.time() - start + + losses_np = { + 'train_loss': float(loss), + 'train_loss_div_batchsize': + float(loss) / self.config.data.batch_size + } + msg = "Train: Rank: {}, ".format(dist.get_rank()) + msg += "epoch: {}, ".format(self.epoch) + msg += "step: {}, ".format(self.iteration) + msg += "time: {:>.3f}s, ".format(iteration_time) + msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_np.items()) + self.logger.info(msg) + + if dist.get_rank() == 0 and self.visualizer: + for k, v in losses_np.items(): + self.visualizer.add_scalar("train/{}".format(k), v, + self.iteration) + + @mp_tools.rank_zero_only + @paddle.no_grad() + def valid(self): + self.logger.info( + f"Valid Total Examples: {len(self.valid_loader.dataset)}") + self.model.eval() + valid_losses = defaultdict(list) + for i, batch in enumerate(self.valid_loader): + loss = self.model(*batch) + + valid_losses['val_loss'].append(float(loss)) + valid_losses['val_loss_div_batchsize'].append( + float(loss) / self.config.data.batch_size) + + # write visual log + valid_losses = {k: np.mean(v) for k, v in valid_losses.items()} + + # logging + msg = f"Valid: Rank: {dist.get_rank()}, " + msg += "epoch: {}, ".format(self.epoch) + msg += "step: {}, ".format(self.iteration) + msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in valid_losses.items()) + self.logger.info(msg) + + if self.visualizer: + for k, v in valid_losses.items(): + self.visualizer.add_scalar("valid/{}".format(k), v, + self.iteration) + + def setup_model(self): + config = self.config + model = DeepSpeech2Model( + feat_size=self.train_loader.dataset.feature_size, + dict_size=self.train_loader.dataset.vocab_size, + num_conv_layers=config.model.num_conv_layers, + num_rnn_layers=config.model.num_rnn_layers, + 
rnn_size=config.model.rnn_layer_size, + use_gru=config.model.use_gru, + share_rnn_weights=config.model.share_rnn_weights) + + if self.parallel: + model = paddle.DataParallel(model) + + layer_tools.print_params(model, self.logger.info) + + grad_clip = MyClipGradByGlobalNorm(config.training.global_grad_clip) + lr_scheduler = paddle.optimizer.lr.ExponentialDecay( + learning_rate=config.training.lr, + gamma=config.training.lr_decay, + verbose=True) + optimizer = paddle.optimizer.Adam( + learning_rate=lr_scheduler, + parameters=model.parameters(), + weight_decay=paddle.regularizer.L2Decay( + config.training.weight_decay), + grad_clip=grad_clip) + + self.model = model + self.optimizer = optimizer + self.lr_scheduler = lr_scheduler + self.logger.info("Setup model/optimizer/lr_scheduler!") + + def setup_dataloader(self): + config = self.config + + train_dataset = ManifestDataset( + config.data.train_manifest, + config.data.vocab_filepath, + config.data.mean_std_filepath, + augmentation_config=io.open( + config.data.augmentation_config, mode='r', + encoding='utf8').read(), + max_duration=config.data.max_duration, + min_duration=config.data.min_duration, + stride_ms=config.data.stride_ms, + window_ms=config.data.window_ms, + n_fft=config.data.n_fft, + max_freq=config.data.max_freq, + target_sample_rate=config.data.target_sample_rate, + specgram_type=config.data.specgram_type, + use_dB_normalization=config.data.use_dB_normalization, + target_dB=config.data.target_dB, + random_seed=config.data.random_seed, + keep_transcription_text=False) + + dev_dataset = ManifestDataset( + config.data.dev_manifest, + config.data.vocab_filepath, + config.data.mean_std_filepath, + augmentation_config="{}", + max_duration=config.data.max_duration, + min_duration=config.data.min_duration, + stride_ms=config.data.stride_ms, + window_ms=config.data.window_ms, + n_fft=config.data.n_fft, + max_freq=config.data.max_freq, + target_sample_rate=config.data.target_sample_rate, + specgram_type=config.data.specgram_type, + use_dB_normalization=config.data.use_dB_normalization, + target_dB=config.data.target_dB, + random_seed=config.data.random_seed, + keep_transcription_text=False) + + if self.parallel: + batch_sampler = SortagradDistributedBatchSampler( + train_dataset, + batch_size=config.data.batch_size, + num_replicas=None, + rank=None, + shuffle=True, + drop_last=True, + sortagrad=config.data.sortagrad, + shuffle_method=config.data.shuffle_method) + else: + batch_sampler = SortagradBatchSampler( + train_dataset, + shuffle=True, + batch_size=config.data.batch_size, + drop_last=True, + sortagrad=config.data.sortagrad, + shuffle_method=config.data.shuffle_method) + + collate_fn = SpeechCollator(is_training=True) + self.train_loader = DataLoader( + train_dataset, + batch_sampler=batch_sampler, + collate_fn=collate_fn, + num_workers=config.data.num_workers, ) + self.valid_loader = DataLoader( + dev_dataset, + batch_size=config.data.batch_size, + shuffle=False, + drop_last=False, + collate_fn=collate_fn) + self.logger.info("Setup train/valid Dataloader!") + + +class DeepSpeech2Tester(DeepSpeech2Trainer): + def __init__(self, config, args): + super().__init__(config, args) + + def ordid2token(self, texts, texts_len): + """ ord() id to chr() chr """ + trans = [] + for text, n in zip(texts, texts_len): + n = n.numpy().item() + ids = text[:n] + trans.append(''.join([chr(i) for i in ids])) + return trans + + def compute_metrics(self, audio, texts, audio_len, texts_len): + cfg = self.config.decoding + errors_sum, len_refs, num_ins = 0.0, 
0, 0 + errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors + error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer + + vocab_list = self.test_loader.dataset.vocab_list + + target_transcripts = self.ordid2token(texts, texts_len) + result_transcripts = self.model.decode( + audio, + audio_len, + vocab_list, + decoding_method=cfg.decoding_method, + lang_model_path=cfg.lang_model_path, + beam_alpha=cfg.alpha, + beam_beta=cfg.beta, + beam_size=cfg.beam_size, + cutoff_prob=cfg.cutoff_prob, + cutoff_top_n=cfg.cutoff_top_n, + num_processes=cfg.num_proc_bsearch) + + for target, result in zip(target_transcripts, result_transcripts): + errors, len_ref = errors_func(target, result) + errors_sum += errors + len_refs += len_ref + num_ins += 1 + self.logger.info( + "\nTarget Transcription: %s\nOutput Transcription: %s" % + (target, result)) + self.logger.info("Current error rate [%s] = %f" % ( + cfg.error_rate_type, error_rate_func(target, result))) + + return dict( + errors_sum=errors_sum, + len_refs=len_refs, + num_ins=num_ins, + error_rate=errors_sum / len_refs, + error_rate_type=cfg.error_rate_type) + + @mp_tools.rank_zero_only + @paddle.no_grad() + def test(self): + self.logger.info( + f"Test Total Examples: {len(self.test_loader.dataset)}") + self.model.eval() + cfg = self.config + error_rate_type = None + errors_sum, len_refs, num_ins = 0.0, 0, 0 + + for i, batch in enumerate(self.test_loader): + metrics = self.compute_metrics(*batch) + errors_sum += metrics['errors_sum'] + len_refs += metrics['len_refs'] + num_ins += metrics['num_ins'] + error_rate_type = metrics['error_rate_type'] + self.logger.info("Error rate [%s] (%d/?) = %f" % + (error_rate_type, num_ins, errors_sum / len_refs)) + + # logging + msg = "Test: " + msg += "epoch: {}, ".format(self.epoch) + msg += "step: {}, ".format(self.iteration) + msg += ", Final error rate [%s] (%d/%d) = %f" % ( + error_rate_type, num_ins, num_ins, errors_sum / len_refs) + self.logger.info(msg) + + def run_test(self): + self.resume_or_load() + try: + self.test() + except KeyboardInterrupt: + exit(-1) + + def export(self): + self.infer_model.eval() + feat_dim = self.test_loader.dataset.feature_size + paddle.jit.save( + self.infer_model, + self.args.export_path, + input_spec=[ + paddle.static.InputSpec( + shape=[None, feat_dim, None], + dtype='float32'), # audio, [B,D,T] + paddle.static.InputSpec(shape=[None], + dtype='int64'), # audio_length, [B] + ]) + + def run_export(self): + try: + self.export() + except KeyboardInterrupt: + exit(-1) + + def setup(self): + """Setup the experiment. 
+ """ + paddle.set_device(self.args.device) + + self.setup_output_dir() + self.setup_checkpointer() + self.setup_logger() + + self.setup_dataloader() + self.setup_model() + + self.iteration = 0 + self.epoch = 0 + + def setup_model(self): + config = self.config + model = DeepSpeech2Model( + feat_size=self.test_loader.dataset.feature_size, + dict_size=self.test_loader.dataset.vocab_size, + num_conv_layers=config.model.num_conv_layers, + num_rnn_layers=config.model.num_rnn_layers, + rnn_size=config.model.rnn_layer_size, + use_gru=config.model.use_gru, + share_rnn_weights=config.model.share_rnn_weights) + + infer_model = DeepSpeech2InferModel.from_pretrained( + self.test_loader.dataset, config, self.args.checkpoint_path) + + self.model = model + self.infer_model = infer_model + self.logger.info("Setup model!") + + def setup_dataloader(self): + config = self.config + # return raw text + test_dataset = ManifestDataset( + config.data.test_manifest, + config.data.vocab_filepath, + config.data.mean_std_filepath, + augmentation_config="{}", + max_duration=config.data.max_duration, + min_duration=config.data.min_duration, + stride_ms=config.data.stride_ms, + window_ms=config.data.window_ms, + n_fft=config.data.n_fft, + max_freq=config.data.max_freq, + target_sample_rate=config.data.target_sample_rate, + specgram_type=config.data.specgram_type, + use_dB_normalization=config.data.use_dB_normalization, + target_dB=config.data.target_dB, + random_seed=config.data.random_seed, + keep_transcription_text=True) + + # return text ord id + self.test_loader = DataLoader( + test_dataset, + batch_size=config.decoding.batch_size, + shuffle=False, + drop_last=False, + collate_fn=SpeechCollator(is_training=False)) + self.logger.info("Setup test Dataloader!") + + def setup_output_dir(self): + """Create a directory used for output. + """ + # output dir + if self.args.output: + output_dir = Path(self.args.output).expanduser() + output_dir.mkdir(parents=True, exist_ok=True) + else: + output_dir = Path( + self.args.checkpoint_path).expanduser().parent.parent + output_dir.mkdir(parents=True, exist_ok=True) + + self.output_dir = output_dir + + def setup_logger(self): + """Initialize a text logger to log the experiment. + + Each process has its own text logger. The logging message is write to + the standard output and a text file named ``worker_n.log`` in the + output directory, where ``n`` means the rank of the process. 
+ """ + format = '[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s' + formatter = logging.Formatter(fmt=format, datefmt='%Y/%m/%d %H:%M:%S') + + logger.setLevel("INFO") + + # global logger + stdout = True + save_path = "" + logging.basicConfig( + level=logging.DEBUG if stdout else logging.INFO, + format=format, + datefmt='%Y/%m/%d %H:%M:%S', + filename=save_path if not stdout else None) + self.logger = logger diff --git a/model_utils/__init__.py b/deepspeech/frontend/__init__.py similarity index 100% rename from model_utils/__init__.py rename to deepspeech/frontend/__init__.py diff --git a/data_utils/audio.py b/deepspeech/frontend/audio.py similarity index 100% rename from data_utils/audio.py rename to deepspeech/frontend/audio.py diff --git a/utils/__init__.py b/deepspeech/frontend/augmentor/__init__.py similarity index 100% rename from utils/__init__.py rename to deepspeech/frontend/augmentor/__init__.py diff --git a/data_utils/augmentor/augmentation.py b/deepspeech/frontend/augmentor/augmentation.py similarity index 89% rename from data_utils/augmentor/augmentation.py rename to deepspeech/frontend/augmentor/augmentation.py index 349cdc564..e50084a00 100644 --- a/data_utils/augmentor/augmentation.py +++ b/deepspeech/frontend/augmentor/augmentation.py @@ -15,17 +15,17 @@ import json import random -from data_utils.augmentor.volume_perturb import VolumePerturbAugmentor -from data_utils.augmentor.shift_perturb import ShiftPerturbAugmentor -from data_utils.augmentor.speed_perturb import SpeedPerturbAugmentor -from data_utils.augmentor.noise_perturb import NoisePerturbAugmentor -from data_utils.augmentor.impulse_response import ImpulseResponseAugmentor -from data_utils.augmentor.resample import ResampleAugmentor -from data_utils.augmentor.online_bayesian_normalization import \ +from deepspeech.frontend.augmentor.volume_perturb import VolumePerturbAugmentor +from deepspeech.frontend.augmentor.shift_perturb import ShiftPerturbAugmentor +from deepspeech.frontend.augmentor.speed_perturb import SpeedPerturbAugmentor +from deepspeech.frontend.augmentor.noise_perturb import NoisePerturbAugmentor +from deepspeech.frontend.augmentor.impulse_response import ImpulseResponseAugmentor +from deepspeech.frontend.augmentor.resample import ResampleAugmentor +from deepspeech.frontend.augmentor.online_bayesian_normalization import \ OnlineBayesianNormalizationAugmentor -class AugmentationPipeline(object): +class AugmentationPipeline(): """Build a pre-processing pipeline with various augmentation models.Such a data augmentation pipeline is oftern leveraged to augment the training samples to make the model invariant to certain types of perturbations in the diff --git a/data_utils/augmentor/base.py b/deepspeech/frontend/augmentor/base.py similarity index 98% rename from data_utils/augmentor/base.py rename to deepspeech/frontend/augmentor/base.py index 5b80be2fe..0f7826cdf 100644 --- a/data_utils/augmentor/base.py +++ b/deepspeech/frontend/augmentor/base.py @@ -16,7 +16,7 @@ from abc import ABCMeta, abstractmethod -class AugmentorBase(object): +class AugmentorBase(): """Abstract base class for augmentation model (augmentor) class. All augmentor classes should inherit from this class, and implement the following abstract methods. 
diff --git a/data_utils/augmentor/impulse_response.py b/deepspeech/frontend/augmentor/impulse_response.py similarity index 90% rename from data_utils/augmentor/impulse_response.py rename to deepspeech/frontend/augmentor/impulse_response.py index 839c6a809..40aa3d47e 100644 --- a/data_utils/augmentor/impulse_response.py +++ b/deepspeech/frontend/augmentor/impulse_response.py @@ -13,9 +13,9 @@ # limitations under the License. """Contains the impulse response augmentation model.""" -from data_utils.augmentor.base import AugmentorBase -from data_utils.utility import read_manifest -from data_utils.audio import AudioSegment +from deepspeech.frontend.augmentor.base import AugmentorBase +from deepspeech.frontend.utility import read_manifest +from deepspeech.frontend.audio import AudioSegment class ImpulseResponseAugmentor(AugmentorBase): diff --git a/data_utils/augmentor/noise_perturb.py b/deepspeech/frontend/augmentor/noise_perturb.py similarity index 93% rename from data_utils/augmentor/noise_perturb.py rename to deepspeech/frontend/augmentor/noise_perturb.py index 954d1b419..350370b8f 100644 --- a/data_utils/augmentor/noise_perturb.py +++ b/deepspeech/frontend/augmentor/noise_perturb.py @@ -13,9 +13,9 @@ # limitations under the License. """Contains the noise perturb augmentation model.""" -from data_utils.augmentor.base import AugmentorBase -from data_utils.utility import read_manifest -from data_utils.audio import AudioSegment +from deepspeech.frontend.augmentor.base import AugmentorBase +from deepspeech.frontend.utility import read_manifest +from deepspeech.frontend.audio import AudioSegment class NoisePerturbAugmentor(AugmentorBase): diff --git a/data_utils/augmentor/online_bayesian_normalization.py b/deepspeech/frontend/augmentor/online_bayesian_normalization.py similarity index 97% rename from data_utils/augmentor/online_bayesian_normalization.py rename to deepspeech/frontend/augmentor/online_bayesian_normalization.py index f5c7d99fd..14c260dfd 100644 --- a/data_utils/augmentor/online_bayesian_normalization.py +++ b/deepspeech/frontend/augmentor/online_bayesian_normalization.py @@ -13,7 +13,7 @@ # limitations under the License. """Contain the online bayesian normalization augmentation model.""" -from data_utils.augmentor.base import AugmentorBase +from deepspeech.frontend.augmentor.base import AugmentorBase class OnlineBayesianNormalizationAugmentor(AugmentorBase): diff --git a/data_utils/augmentor/resample.py b/deepspeech/frontend/augmentor/resample.py similarity index 95% rename from data_utils/augmentor/resample.py rename to deepspeech/frontend/augmentor/resample.py index 3732e09cd..8ef574cbb 100644 --- a/data_utils/augmentor/resample.py +++ b/deepspeech/frontend/augmentor/resample.py @@ -13,7 +13,7 @@ # limitations under the License. """Contain the resample augmentation model.""" -from data_utils.augmentor.base import AugmentorBase +from deepspeech.frontend.augmentor.base import AugmentorBase class ResampleAugmentor(AugmentorBase): diff --git a/data_utils/augmentor/shift_perturb.py b/deepspeech/frontend/augmentor/shift_perturb.py similarity index 96% rename from data_utils/augmentor/shift_perturb.py rename to deepspeech/frontend/augmentor/shift_perturb.py index 8b8e60362..2edbf594d 100644 --- a/data_utils/augmentor/shift_perturb.py +++ b/deepspeech/frontend/augmentor/shift_perturb.py @@ -13,7 +13,7 @@ # limitations under the License. 
"""Contains the volume perturb augmentation model.""" -from data_utils.augmentor.base import AugmentorBase +from deepspeech.frontend.augmentor.base import AugmentorBase class ShiftPerturbAugmentor(AugmentorBase): diff --git a/data_utils/augmentor/speed_perturb.py b/deepspeech/frontend/augmentor/speed_perturb.py similarity index 97% rename from data_utils/augmentor/speed_perturb.py rename to deepspeech/frontend/augmentor/speed_perturb.py index 7b28f7ec6..6518382db 100644 --- a/data_utils/augmentor/speed_perturb.py +++ b/deepspeech/frontend/augmentor/speed_perturb.py @@ -13,7 +13,7 @@ # limitations under the License. """Contain the speech perturbation augmentation model.""" -from data_utils.augmentor.base import AugmentorBase +from deepspeech.frontend.augmentor.base import AugmentorBase class SpeedPerturbAugmentor(AugmentorBase): diff --git a/data_utils/augmentor/volume_perturb.py b/deepspeech/frontend/augmentor/volume_perturb.py similarity index 96% rename from data_utils/augmentor/volume_perturb.py rename to deepspeech/frontend/augmentor/volume_perturb.py index b98c7a3b4..dc64d0e9e 100644 --- a/data_utils/augmentor/volume_perturb.py +++ b/deepspeech/frontend/augmentor/volume_perturb.py @@ -13,7 +13,7 @@ # limitations under the License. """Contains the volume perturb augmentation model.""" -from data_utils.augmentor.base import AugmentorBase +from deepspeech.frontend.augmentor.base import AugmentorBase class VolumePerturbAugmentor(AugmentorBase): diff --git a/deploy/_init_paths.py b/deepspeech/frontend/featurizer/__init__.py similarity index 69% rename from deploy/_init_paths.py rename to deepspeech/frontend/featurizer/__init__.py index c4b28c643..185a92b8d 100644 --- a/deploy/_init_paths.py +++ b/deepspeech/frontend/featurizer/__init__.py @@ -11,19 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Set up paths for DS2""" - -import os.path -import sys - - -def add_path(path): - if path not in sys.path: - sys.path.insert(0, path) - - -this_dir = os.path.dirname(__file__) - -# Add project path to PYTHONPATH -proj_path = os.path.join(this_dir, '..') -add_path(proj_path) diff --git a/data_utils/featurizer/audio_featurizer.py b/deepspeech/frontend/featurizer/audio_featurizer.py similarity index 90% rename from data_utils/featurizer/audio_featurizer.py rename to deepspeech/frontend/featurizer/audio_featurizer.py index 0afd19870..b5edb32d5 100644 --- a/data_utils/featurizer/audio_featurizer.py +++ b/deepspeech/frontend/featurizer/audio_featurizer.py @@ -14,8 +14,8 @@ """Contains the audio featurizer class.""" import numpy as np -from data_utils.utility import read_manifest -from data_utils.audio import AudioSegment +from deepspeech.frontend.utility import read_manifest +from deepspeech.frontend.audio import AudioSegment from python_speech_features import mfcc from python_speech_features import delta @@ -52,6 +52,7 @@ class AudioFeaturizer(object): specgram_type='linear', stride_ms=10.0, window_ms=20.0, + n_fft=None, max_freq=None, target_sample_rate=16000, use_dB_normalization=True, @@ -63,6 +64,7 @@ class AudioFeaturizer(object): self._target_sample_rate = target_sample_rate self._use_dB_normalization = use_dB_normalization self._target_dB = target_dB + self._fft_point = n_fft def featurize(self, audio_segment, @@ -98,6 +100,22 @@ class AudioFeaturizer(object): return self._compute_specgram(audio_segment.samples, audio_segment.sample_rate) + @property + def feature_size(self): + """audio feature size""" + feat_dim = 0 + if self._specgram_type == 'linear': + fft_point = self._window_ms if self._fft_point is None else self._fft_point + feat_dim = int(fft_point * (self._target_sample_rate / 1000) / 2 + + 1) + elif self._specgram_type == 'mfcc': + # mfcc,delta, delta-delta + feat_dim = int(13 * 3) + else: + raise ValueError("Unknown specgram_type %s. " + "Supported values: linear." % self._specgram_type) + return feat_dim + def _compute_specgram(self, samples, sample_rate): """Extract various audio features.""" if self._specgram_type == 'linear': @@ -150,7 +168,8 @@ class AudioFeaturizer(object): windows[:, 1] == samples[stride_size:(stride_size + window_size)]) # window weighting, squared Fast Fourier Transform (fft), scaling weighting = np.hanning(window_size)[:, None] - fft = np.fft.rfft(windows * weighting, axis=0) + # https://numpy.org/doc/stable/reference/generated/numpy.fft.rfft.html + fft = np.fft.rfft(windows * weighting, n=None, axis=0) fft = np.absolute(fft) fft = fft**2 scale = np.sum(weighting**2) * sample_rate diff --git a/data_utils/featurizer/speech_featurizer.py b/deepspeech/frontend/featurizer/speech_featurizer.py similarity index 91% rename from data_utils/featurizer/speech_featurizer.py rename to deepspeech/frontend/featurizer/speech_featurizer.py index 2e1424fa4..d4de96adc 100644 --- a/data_utils/featurizer/speech_featurizer.py +++ b/deepspeech/frontend/featurizer/speech_featurizer.py @@ -13,8 +13,8 @@ # limitations under the License. 
"""Contains the speech featurizer class.""" -from data_utils.featurizer.audio_featurizer import AudioFeaturizer -from data_utils.featurizer.text_featurizer import TextFeaturizer +from deepspeech.frontend.featurizer.audio_featurizer import AudioFeaturizer +from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer class SpeechFeaturizer(object): @@ -56,6 +56,7 @@ class SpeechFeaturizer(object): specgram_type='linear', stride_ms=10.0, window_ms=20.0, + n_fft=None, max_freq=None, target_sample_rate=16000, use_dB_normalization=True, @@ -64,6 +65,7 @@ class SpeechFeaturizer(object): specgram_type=specgram_type, stride_ms=stride_ms, window_ms=window_ms, + n_fft=n_fft, max_freq=max_freq, target_sample_rate=target_sample_rate, use_dB_normalization=use_dB_normalization, @@ -106,3 +108,12 @@ class SpeechFeaturizer(object): :rtype: list """ return self._text_featurizer.vocab_list + + @property + def feature_size(self): + """Return the audio feature size. + + :return: audio feature size. + :rtype: int + """ + return self._audio_featurizer.feature_size \ No newline at end of file diff --git a/data_utils/featurizer/text_featurizer.py b/deepspeech/frontend/featurizer/text_featurizer.py similarity index 92% rename from data_utils/featurizer/text_featurizer.py rename to deepspeech/frontend/featurizer/text_featurizer.py index 70aa10ead..a1e8cdbb1 100644 --- a/data_utils/featurizer/text_featurizer.py +++ b/deepspeech/frontend/featurizer/text_featurizer.py @@ -30,6 +30,7 @@ class TextFeaturizer(object): """ def __init__(self, vocab_filepath): + self.unk = '' self._vocab_dict, self._vocab_list = self._load_vocabulary_from_file( vocab_filepath) @@ -43,7 +44,11 @@ class TextFeaturizer(object): :rtype: list """ tokens = self._char_tokenize(text) - return [self._vocab_dict[token] for token in tokens] + ids = [] + for token in tokens: + token = token if token in self._vocab_dict else self.unk + ids.append(self._vocab_dict[token]) + return ids @property def vocab_size(self): diff --git a/data_utils/normalizer.py b/deepspeech/frontend/normalizer.py similarity index 97% rename from data_utils/normalizer.py rename to deepspeech/frontend/normalizer.py index 83a008f10..8e50566c6 100644 --- a/data_utils/normalizer.py +++ b/deepspeech/frontend/normalizer.py @@ -15,8 +15,8 @@ import numpy as np import random -from data_utils.utility import read_manifest -from data_utils.audio import AudioSegment +from deepspeech.frontend.utility import read_manifest +from deepspeech.frontend.audio import AudioSegment class FeatureNormalizer(object): diff --git a/data_utils/speech.py b/deepspeech/frontend/speech.py similarity index 91% rename from data_utils/speech.py rename to deepspeech/frontend/speech.py index 01c1787a4..2883405bb 100644 --- a/data_utils/speech.py +++ b/deepspeech/frontend/speech.py @@ -14,28 +14,33 @@ """Contains the speech segment class.""" import numpy as np -from data_utils.audio import AudioSegment +from deepspeech.frontend.audio import AudioSegment class SpeechSegment(AudioSegment): - """Speech segment abstraction, a subclass of AudioSegment, - with an additional transcript. - - :param samples: Audio samples [num_samples x num_channels]. - :type samples: ndarray.float32 - :param sample_rate: Audio sample rate. - :type sample_rate: int - :param transcript: Transcript text for the speech. - :type transript: str - :raises TypeError: If the sample data type is not float or int. 
+ """Speech Segment with Text + + Args: + AudioSegment (AudioSegment): Audio Segment """ def __init__(self, samples, sample_rate, transcript): + """Speech segment abstraction, a subclass of AudioSegment, + with an additional transcript. + + Args: + samples (ndarray.float32): Audio samples [num_samples x num_channels]. + sample_rate (int): Audio sample rate. + transcript (str): Transcript text for the speech. + """ AudioSegment.__init__(self, samples, sample_rate) self._transcript = transcript def __eq__(self, other): """Return whether two objects are equal. + + Returns: + bool: True, when equal to other """ if not AudioSegment.__eq__(self, other): return False diff --git a/data_utils/utility.py b/deepspeech/frontend/utility.py similarity index 53% rename from data_utils/utility.py rename to deepspeech/frontend/utility.py index 6cc1b2713..3694e106a 100644 --- a/data_utils/utility.py +++ b/deepspeech/frontend/utility.py @@ -20,6 +20,7 @@ import tarfile import time from threading import Thread from multiprocessing import Process, Manager, Value + from paddle.dataset.common import md5file @@ -49,51 +50,3 @@ def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0): json_data["duration"] >= min_duration): manifest.append(json_data) return manifest - - -def getfile_insensitive(path): - """Get the actual file path when given insensitive filename.""" - directory, filename = os.path.split(path) - directory, filename = (directory or '.'), filename.lower() - for f in os.listdir(directory): - newpath = os.path.join(directory, f) - if os.path.isfile(newpath) and f.lower() == filename: - return newpath - - -def download_multi(url, target_dir, extra_args): - """Download multiple files from url to target_dir.""" - if not os.path.exists(target_dir): os.makedirs(target_dir) - print("Downloading %s ..." % url) - ret_code = os.system("wget -c " + url + ' ' + extra_args + " -P " + - target_dir) - return ret_code - - -def download(url, md5sum, target_dir): - """Download file from url to target_dir, and check md5sum.""" - if not os.path.exists(target_dir): os.makedirs(target_dir) - filepath = os.path.join(target_dir, url.split("/")[-1]) - if not (os.path.exists(filepath) and md5file(filepath) == md5sum): - print("Downloading %s ..." % url) - os.system("wget -c " + url + " -P " + target_dir) - print("\nMD5 Chesksum %s ..." % filepath) - if not md5file(filepath) == md5sum: - raise RuntimeError("MD5 checksum failed.") - else: - print("File exists, skip downloading. (%s)" % filepath) - return filepath - - -def unpack(filepath, target_dir, rm_tar=False): - """Unpack the file to the target_dir.""" - print("Unpacking %s ..." % filepath) - tar = tarfile.open(filepath) - tar.extractall(target_dir) - tar.close() - if rm_tar == True: - os.remove(filepath) - - -class XmapEndSignal(): - pass diff --git a/deepspeech/io/__init__.py b/deepspeech/io/__init__.py new file mode 100644 index 000000000..12e1d4d91 --- /dev/null +++ b/deepspeech/io/__init__.py @@ -0,0 +1,128 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.io import DataLoader + +from deepspeech.io.collator import SpeechCollator +from deepspeech.io.sampler import SortagradDistributedBatchSampler +from deepspeech.io.sampler import SortagradBatchSampler +from deepspeech.io.dataset import ManifestDataset + + +def create_dataloader(manifest_path, + vocab_filepath, + mean_std_filepath, + augmentation_config='{}', + max_duration=float('inf'), + min_duration=0.0, + stride_ms=10.0, + window_ms=20.0, + max_freq=None, + specgram_type='linear', + use_dB_normalization=True, + random_seed=0, + keep_transcription_text=False, + is_training=False, + batch_size=1, + num_workers=0, + sortagrad=False, + shuffle_method=None, + dist=False): + + dataset = ManifestDataset( + manifest_path, + vocab_filepath, + mean_std_filepath, + augmentation_config=augmentation_config, + max_duration=max_duration, + min_duration=min_duration, + stride_ms=stride_ms, + window_ms=window_ms, + max_freq=max_freq, + specgram_type=specgram_type, + use_dB_normalization=use_dB_normalization, + random_seed=random_seed, + keep_transcription_text=keep_transcription_text) + + if dist: + batch_sampler = SortagradDistributedBatchSampler( + dataset, + batch_size, + num_replicas=None, + rank=None, + shuffle=is_training, + drop_last=is_training, + sortagrad=is_training, + shuffle_method=shuffle_method) + else: + batch_sampler = SortagradBatchSampler( + dataset, + shuffle=is_training, + batch_size=batch_size, + drop_last=is_training, + sortagrad=is_training, + shuffle_method=shuffle_method) + + def padding_batch(batch, padding_to=-1, flatten=False, is_training=True): + """ + Padding audio features with zeros to make them have the same shape (or + a user-defined shape) within one bach. + + If ``padding_to`` is -1, the maximun shape in the batch will be used + as the target shape for padding. Otherwise, `padding_to` will be the + target shape (only refers to the second axis). + + If `flatten` is True, features will be flatten to 1darray. 
+ """ + new_batch = [] + # get target shape + max_length = max([audio.shape[1] for audio, text in batch]) + if padding_to != -1: + if padding_to < max_length: + raise ValueError("If padding_to is not -1, it should be larger " + "than any instance's shape in the batch") + max_length = padding_to + max_text_length = max([len(text) for audio, text in batch]) + # padding + padded_audios = [] + audio_lens = [] + texts, text_lens = [], [] + for audio, text in batch: + padded_audio = np.zeros([audio.shape[0], max_length]) + padded_audio[:, :audio.shape[1]] = audio + if flatten: + padded_audio = padded_audio.flatten() + padded_audios.append(padded_audio) + audio_lens.append(audio.shape[1]) + + padded_text = np.zeros([max_text_length]) + if is_training: + padded_text[:len(text)] = text #ids + else: + padded_text[:len(text)] = [ord(t) for t in text] # string + texts.append(padded_text) + text_lens.append(len(text)) + + padded_audios = np.array(padded_audios).astype('float32') + audio_lens = np.array(audio_lens).astype('int64') + texts = np.array(texts).astype('int32') + text_lens = np.array(text_lens).astype('int64') + return padded_audios, texts, audio_lens, text_lens + + loader = DataLoader( + dataset, + batch_sampler=batch_sampler, + collate_fn=partial(padding_batch, is_training=is_training), + num_workers=num_workers) + return loader diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py new file mode 100644 index 000000000..10f838fb2 --- /dev/null +++ b/deepspeech/io/collator.py @@ -0,0 +1,73 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import numpy as np +from collections import namedtuple + +logger = logging.getLogger(__name__) + +__all__ = [ + "SpeechCollator", +] + + +class SpeechCollator(): + def __init__(self, padding_to=-1, is_training=True): + """ + Padding audio features with zeros to make them have the same shape (or + a user-defined shape) within one bach. + + If ``padding_to`` is -1, the maximun shape in the batch will be used + as the target shape for padding. Otherwise, `padding_to` will be the + target shape (only refers to the second axis). 
+ """ + self._padding_to = padding_to + self._is_training = is_training + + def __call__(self, batch): + new_batch = [] + # get target shape + max_length = max([audio.shape[1] for audio, _ in batch]) + if self._padding_to != -1: + if self._padding_to < max_length: + raise ValueError("If padding_to is not -1, it should be larger " + "than any instance's shape in the batch") + max_length = self._padding_to + max_text_length = max([len(text) for _, text in batch]) + # padding + padded_audios = [] + audio_lens = [] + texts, text_lens = [], [] + for audio, text in batch: + # audio + padded_audio = np.zeros([audio.shape[0], max_length]) + padded_audio[:, :audio.shape[1]] = audio + padded_audios.append(padded_audio) + audio_lens.append(audio.shape[1]) + # text + padded_text = np.zeros([max_text_length]) + if self._is_training: + padded_text[:len(text)] = text # token ids + else: + padded_text[:len(text)] = [ord(t) + for t in text] # string, unicode ord + texts.append(padded_text) + text_lens.append(len(text)) + + padded_audios = np.array(padded_audios).astype('float32') + audio_lens = np.array(audio_lens).astype('int64') + texts = np.array(texts).astype('int32') + text_lens = np.array(text_lens).astype('int64') + return padded_audios, texts, audio_lens, text_lens diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py new file mode 100644 index 000000000..b4c1c7afd --- /dev/null +++ b/deepspeech/io/dataset.py @@ -0,0 +1,206 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import random +import tarfile +import logging +import numpy as np +from collections import namedtuple +from functools import partial + +from paddle.io import Dataset + +from deepspeech.frontend.utility import read_manifest +from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline +from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer +from deepspeech.frontend.speech import SpeechSegment +from deepspeech.frontend.normalizer import FeatureNormalizer + +logger = logging.getLogger(__name__) + +__all__ = [ + "ManifestDataset", +] + + +class ManifestDataset(Dataset): + def __init__(self, + manifest_path, + vocab_filepath, + mean_std_filepath, + augmentation_config='{}', + max_duration=float('inf'), + min_duration=0.0, + stride_ms=10.0, + window_ms=20.0, + n_fft=None, + max_freq=None, + target_sample_rate=16000, + specgram_type='linear', + use_dB_normalization=True, + target_dB=-20, + random_seed=0, + keep_transcription_text=False): + """Manifest Dataset + + Args: + manifest_path (str): manifest josn file path + vocab_filepath (str): vocab file path + mean_std_filepath (str): mean and std file path, which suffix is *.npy + augmentation_config (str, optional): augmentation json str. Defaults to '{}'. + max_duration (float, optional): audio length in seconds must less than this. Defaults to float('inf'). + min_duration (float, optional): audio length is seconds must greater than this. Defaults to 0.0. 
+ stride_ms (float, optional): stride size in ms. Defaults to 10.0. + window_ms (float, optional): window size in ms. Defaults to 20.0. + n_fft (int, optional): fft points for rfft. Defaults to None. + max_freq (int, optional): max cut freq. Defaults to None. + target_sample_rate (int, optional): target sample rate which used for training. Defaults to 16000. + specgram_type (str, optional): 'linear' or 'mfcc'. Defaults to 'linear'. + use_dB_normalization (bool, optional): do dB normalization. Defaults to True. + target_dB (int, optional): target dB. Defaults to -20. + random_seed (int, optional): for random generator. Defaults to 0. + keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False. + """ + super().__init__() + + self._max_duration = max_duration + self._min_duration = min_duration + self._normalizer = FeatureNormalizer(mean_std_filepath) + self._augmentation_pipeline = AugmentationPipeline( + augmentation_config=augmentation_config, random_seed=random_seed) + self._speech_featurizer = SpeechFeaturizer( + vocab_filepath=vocab_filepath, + specgram_type=specgram_type, + stride_ms=stride_ms, + window_ms=window_ms, + n_fft=n_fft, + max_freq=max_freq, + target_sample_rate=target_sample_rate, + use_dB_normalization=use_dB_normalization, + target_dB=target_dB) + self._rng = random.Random(random_seed) + self._keep_transcription_text = keep_transcription_text + # for caching tar files info + self._local_data = namedtuple('local_data', ['tar2info', 'tar2object']) + self._local_data.tar2info = {} + self._local_data.tar2object = {} + + # read manifest + self._manifest = read_manifest( + manifest_path=manifest_path, + max_duration=self._max_duration, + min_duration=self._min_duration) + self._manifest.sort(key=lambda x: x["duration"]) + + @property + def manifest(self): + return self._manifest + + @property + def vocab_size(self): + """Return the vocabulary size. + + :return: Vocabulary size. + :rtype: int + """ + return self._speech_featurizer.vocab_size + + @property + def vocab_list(self): + """Return the vocabulary in list. + + :return: Vocabulary in list. + :rtype: list + """ + return self._speech_featurizer.vocab_list + + @property + def feature_size(self): + return self._speech_featurizer.feature_size + + def _parse_tar(self, file): + """Parse a tar file to get a tarfile object + and a map containing tarinfoes + """ + result = {} + f = tarfile.open(file) + for tarinfo in f.getmembers(): + result[tarinfo.name] = tarinfo + return f, result + + def _subfile_from_tar(self, file): + """Get subfile object from tar. + + It will return a subfile object from tar file + and cached tar file info for next reading request. + """ + tarpath, filename = file.split(':', 1)[1].split('#', 1) + if 'tar2info' not in self._local_data.__dict__: + self._local_data.tar2info = {} + if 'tar2object' not in self._local_data.__dict__: + self._local_data.tar2object = {} + if tarpath not in self._local_data.tar2info: + object, infoes = self._parse_tar(tarpath) + self._local_data.tar2info[tarpath] = infoes + self._local_data.tar2object[tarpath] = object + return self._local_data.tar2object[tarpath].extractfile( + self._local_data.tar2info[tarpath][filename]) + + def process_utterance(self, audio_file, transcript): + """Load, augment, featurize and normalize for speech data. + + :param audio_file: Filepath or file object of audio file. + :type audio_file: str | file + :param transcript: Transcription text. 
+ :type transcript: str + :return: Tuple of audio feature tensor and data of transcription part, + where transcription part could be token ids or text. + :rtype: tuple of (2darray, list) + """ + if isinstance(audio_file, str) and audio_file.startswith('tar:'): + speech_segment = SpeechSegment.from_file( + self._subfile_from_tar(audio_file), transcript) + else: + speech_segment = SpeechSegment.from_file(audio_file, transcript) + self._augmentation_pipeline.transform_audio(speech_segment) + specgram, transcript_part = self._speech_featurizer.featurize( + speech_segment, self._keep_transcription_text) + specgram = self._normalizer.apply(specgram) + return specgram, transcript_part + + def _instance_reader_creator(self, manifest): + """ + Instance reader creator. Create a callable function to produce + instances of data. + + Instance: a tuple of ndarray of audio spectrogram and a list of + token indices for transcript. + """ + + def reader(): + for instance in manifest: + inst = self.process_utterance(instance["audio_filepath"], + instance["text"]) + yield inst + + return reader + + def __len__(self): + return len(self._manifest) + + def __getitem__(self, idx): + instance = self._manifest[idx] + return self.process_utterance(instance["audio_filepath"], + instance["text"]) diff --git a/deepspeech/io/sampler.py b/deepspeech/io/sampler.py new file mode 100644 index 000000000..5bc49dad8 --- /dev/null +++ b/deepspeech/io/sampler.py @@ -0,0 +1,256 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import random +import tarfile +import logging +import numpy as np +from collections import namedtuple +from functools import partial + +import paddle +from paddle.io import BatchSampler +from paddle.io import DistributedBatchSampler +from paddle import distributed as dist + +logger = logging.getLogger(__name__) + +__all__ = [ + "SortagradDistributedBatchSampler", + "SortagradBatchSampler", +] + + +def _batch_shuffle(indices, batch_size, epoch, clipped=False): + """Put similarly-sized instances into minibatches for better efficiency + and make a batch-wise shuffle. + + 1. Sort the audio clips by duration. + 2. Generate a random number `k`, k in [0, batch_size). + 3. Randomly shift `k` instances in order to create different batches + for different epochs. Create minibatches. + 4. Shuffle the minibatches. + + :param indices: indexes. List of int. + :type indices: list + :param batch_size: Batch size. This size is also used for generate + a random number for batch shuffle. + :type batch_size: int + :param clipped: Whether to clip the heading (small shift) and trailing + (incomplete batch) instances. + :type clipped: bool + :return: Batch shuffled mainifest. 
+ :rtype: list + """ + rng = np.random.RandomState(epoch) + shift_len = rng.randint(0, batch_size - 1) + batch_indices = list(zip(* [iter(indices[shift_len:])] * batch_size)) + rng.shuffle(batch_indices) + batch_indices = [item for batch in batch_indices for item in batch] + assert (clipped == False) + if not clipped: + res_len = len(indices) - shift_len - len(batch_indices) + # when res_len is 0, will return whole list, len(List[-0:]) = len(List[:]) + if res_len != 0: + batch_indices.extend(indices[-res_len:]) + batch_indices.extend(indices[0:shift_len]) + assert len(indices) == len( + batch_indices + ), f"_batch_shuffle: {len(indices)} : {len(batch_indices)} : {res_len} - {shift_len}" + return batch_indices + + +class SortagradDistributedBatchSampler(DistributedBatchSampler): + def __init__(self, + dataset, + batch_size, + num_replicas=None, + rank=None, + shuffle=False, + drop_last=False, + sortagrad=False, + shuffle_method="batch_shuffle"): + """Sortagrad Sampler for multi gpus. + + Args: + dataset (paddle.io.Dataset): + batch_size (int): batch size for one gpu + num_replicas (int, optional): world size or numbers of gpus. Defaults to None. + rank (int, optional): rank id. Defaults to None. + shuffle (bool, optional): True for do shuffle, or else. Defaults to False. + drop_last (bool, optional): whether drop last batch which is less than batch size. Defaults to False. + sortagrad (bool, optional): True, do sortgrad in first epoch, then shuffle as usual; or else. Defaults to False. + shuffle_method (str, optional): shuffle method, "instance_shuffle" or "batch_shuffle". Defaults to "batch_shuffle". + """ + super().__init__(dataset, batch_size, num_replicas, rank, shuffle, + drop_last) + self._sortagrad = sortagrad + self._shuffle_method = shuffle_method + + def __iter__(self): + num_samples = len(self.dataset) + indices = np.arange(num_samples).tolist() + indices += indices[:(self.total_size - len(indices))] + assert len(indices) == self.total_size + + # sort (by duration) or batch-wise shuffle the manifest + if self.shuffle: + if self.epoch == 0 and self._sortagrad: + logger.info( + f'rank: {dist.get_rank()} dataset sortagrad! epoch {self.epoch}' + ) + else: + logger.info( + f'rank: {dist.get_rank()} dataset shuffle! epoch {self.epoch}' + ) + if self._shuffle_method == "batch_shuffle": + # using `batch_size * nrank`, or will cause instability loss and nan or inf grad, + # since diff batch examlpe length in batches case instability loss in diff rank, + # e.g. rank0 maxlength 20, rank3 maxlength 1000 + indices = _batch_shuffle( + indices, + self.batch_size * self.nranks, + self.epoch, + clipped=False) + elif self._shuffle_method == "instance_shuffle": + np.random.RandomState(self.epoch).shuffle(indices) + else: + raise ValueError("Unknown shuffle method %s." 
% + self._shuffle_method) + assert len( + indices + ) == self.total_size, f"batch shuffle examples error: {len(indices)} : {self.total_size}" + + # slice `self.batch_size` examples by rank id + def _get_indices_by_batch_size(indices): + subsampled_indices = [] + last_batch_size = self.total_size % (self.batch_size * self.nranks) + assert last_batch_size % self.nranks == 0 + last_local_batch_size = last_batch_size // self.nranks + + for i in range(self.local_rank * self.batch_size, + len(indices) - last_batch_size, + self.batch_size * self.nranks): + subsampled_indices.extend(indices[i:i + self.batch_size]) + + indices = indices[len(indices) - last_batch_size:] + subsampled_indices.extend( + indices[self.local_rank * last_local_batch_size:( + self.local_rank + 1) * last_local_batch_size]) + return subsampled_indices + + if self.nranks > 1: + indices = _get_indices_by_batch_size(indices) + + assert len(indices) == self.num_samples + _sample_iter = iter(indices) + + batch_indices = [] + for idx in _sample_iter: + batch_indices.append(idx) + if len(batch_indices) == self.batch_size: + logger.info( + f"rank: {dist.get_rank()} batch index: {batch_indices} ") + yield batch_indices + batch_indices = [] + if not self.drop_last and len(batch_indices) > 0: + yield batch_indices + + def __len__(self): + num_samples = self.num_samples + num_samples += int(not self.drop_last) * (self.batch_size - 1) + return num_samples // self.batch_size + + +class SortagradBatchSampler(BatchSampler): + def __init__(self, + dataset, + batch_size, + shuffle=False, + drop_last=False, + sortagrad=False, + shuffle_method="batch_shuffle"): + """Sortagrad Sampler for one gpu. + + Args: + dataset (paddle.io.Dataset): + batch_size (int): batch size for one gpu + shuffle (bool, optional): True for do shuffle, or else. Defaults to False. + drop_last (bool, optional): whether drop last batch which is less than batch size. Defaults to False. + sortagrad (bool, optional): True, do sortgrad in first epoch, then shuffle as usual; or else. Defaults to False. + shuffle_method (str, optional): shuffle method, "instance_shuffle" or "batch_shuffle". Defaults to "batch_shuffle". + """ + self.dataset = dataset + + assert isinstance(batch_size, int) and batch_size > 0, \ + "batch_size should be a positive integer" + self.batch_size = batch_size + assert isinstance(shuffle, bool), \ + "shuffle should be a boolean value" + self.shuffle = shuffle + assert isinstance(drop_last, bool), \ + "drop_last should be a boolean number" + + self.drop_last = drop_last + self.epoch = 0 + self.num_samples = int(math.ceil(len(self.dataset) * 1.0)) + self.total_size = self.num_samples + self._sortagrad = sortagrad + self._shuffle_method = shuffle_method + + def __iter__(self): + num_samples = len(self.dataset) + indices = np.arange(num_samples).tolist() + indices += indices[:(self.total_size - len(indices))] + assert len(indices) == self.total_size + + # sort (by duration) or batch-wise shuffle the manifest + if self.shuffle: + if self.epoch == 0 and self._sortagrad: + logger.info(f'dataset sortagrad! epoch {self.epoch}') + else: + logger.info(f'dataset shuffle! epoch {self.epoch}') + if self._shuffle_method == "batch_shuffle": + indices = _batch_shuffle( + indices, self.batch_size, self.epoch, clipped=False) + elif self._shuffle_method == "instance_shuffle": + np.random.RandomState(self.epoch).shuffle(indices) + else: + raise ValueError("Unknown shuffle method %s." 
% + self._shuffle_method) + assert len( + indices + ) == self.total_size, f"batch shuffle examples error: {len(indices)} : {self.total_size}" + + assert len(indices) == self.num_samples + _sample_iter = iter(indices) + + batch_indices = [] + for idx in _sample_iter: + batch_indices.append(idx) + if len(batch_indices) == self.batch_size: + logger.info( + f"rank: {dist.get_rank()} batch index: {batch_indices} ") + yield batch_indices + batch_indices = [] + if not self.drop_last and len(batch_indices) > 0: + yield batch_indices + + self.epoch += 1 + + def __len__(self): + num_samples = self.num_samples + num_samples += int(not self.drop_last) * (self.batch_size - 1) + return num_samples // self.batch_size diff --git a/decoders/swig/_init_paths.py b/deepspeech/models/__init__.py similarity index 69% rename from decoders/swig/_init_paths.py rename to deepspeech/models/__init__.py index c4b28c643..185a92b8d 100644 --- a/decoders/swig/_init_paths.py +++ b/deepspeech/models/__init__.py @@ -11,19 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Set up paths for DS2""" - -import os.path -import sys - - -def add_path(path): - if path not in sys.path: - sys.path.insert(0, path) - - -this_dir = os.path.dirname(__file__) - -# Add project path to PYTHONPATH -proj_path = os.path.join(this_dir, '..') -add_path(proj_path) diff --git a/deepspeech/models/deepspeech2.py b/deepspeech/models/deepspeech2.py new file mode 100644 index 000000000..b58260749 --- /dev/null +++ b/deepspeech/models/deepspeech2.py @@ -0,0 +1,442 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import math +import collections +import numpy as np +import logging +from typing import Optional +from yacs.config import CfgNode + +import paddle +from paddle import nn +from paddle.nn import functional as F +from paddle.nn import initializer as I + +from deepspeech.modules.conv import ConvStack +from deepspeech.modules.rnn import RNNStack +from deepspeech.modules.mask import sequence_mask +from deepspeech.modules.activation import brelu +from deepspeech.utils import checkpoint +from deepspeech.utils import layer_tools +from deepspeech.decoders.swig_wrapper import Scorer +from deepspeech.decoders.swig_wrapper import ctc_greedy_decoder +from deepspeech.decoders.swig_wrapper import ctc_beam_search_decoder_batch + +from deepspeech.modules.loss import CTCLoss + +logger = logging.getLogger(__name__) + +__all__ = ['DeepSpeech2Model'] + + +class CRNNEncoder(nn.Layer): + def __init__(self, + feat_size, + dict_size, + num_conv_layers=2, + num_rnn_layers=3, + rnn_size=1024, + use_gru=False, + share_rnn_weights=True): + super().__init__() + self.rnn_size = rnn_size + self.feat_size = feat_size # 161 for linear + self.dict_size = dict_size + + self.conv = ConvStack(feat_size, num_conv_layers) + + i_size = self.conv.output_height # H after conv stack + self.rnn = RNNStack( + i_size=i_size, + h_size=rnn_size, + num_stacks=num_rnn_layers, + use_gru=use_gru, + share_rnn_weights=share_rnn_weights) + + @property + def output_size(self): + return self.rnn_size * 2 + + def forward(self, audio, audio_len): + """ + audio: shape [B, D, T] + text: shape [B, T] + audio_len: shape [B] + text_len: shape [B] + """ + """Compute Encoder outputs + + Args: + audio (Tensor): [B, D, T] + text (Tensor): [B, T] + audio_len (Tensor): [B] + text_len (Tensor): [B] + Returns: + x (Tensor): encoder outputs, [B, T, D] + x_lens (Tensor): encoder length, [B] + """ + # [B, D, T] -> [B, C=1, D, T] + x = audio.unsqueeze(1) + x_lens = audio_len + + # convolution group + x, x_lens = self.conv(x, x_lens) + + # convert data from convolution feature map to sequence of vectors + #B, C, D, T = paddle.shape(x) # not work under jit + x = x.transpose([0, 3, 1, 2]) #[B, T, C, D] + #x = x.reshape([B, T, C * D]) #[B, T, C*D] # not work under jit + x = x.reshape([0, 0, -1]) #[B, T, C*D] + + # remove padding part + x, x_lens = self.rnn(x, x_lens) #[B, T, D] + return x, x_lens + + +class CTCDecoder(nn.Layer): + def __init__(self, enc_n_units, vocab_size): + super().__init__() + self.blank_id = vocab_size + self.output = nn.Linear(enc_n_units, + vocab_size + 1) # blank id is last id + self.criterion = CTCLoss(self.blank_id) + + self._ext_scorer = None + + def forward(self, eout, eout_lens, texts, texts_len): + """Compute CTC Loss + + Args: + eout (Tensor): + eout_lens (Tensor): + texts (Tenosr): + texts_len (Tensor): + Returns: + loss (Tenosr): [1] + """ + logits = self.output(eout) + loss = self.criterion(logits, texts, eout_lens, texts_len) + return loss + + def probs(self, eouts, temperature=1.): + """Get CTC probabilities. + Args: + eouts (FloatTensor): `[B, T, enc_units]` + Returns: + probs (FloatTensor): `[B, T, vocab]` + """ + return F.softmax(self.output(eouts) / temperature, axis=-1) + + def scores(self, eouts, temperature=1.): + """Get log-scale CTC probabilities. 
+ Args: + eouts (FloatTensor): `[B, T, enc_units]` + Returns: + log_probs (FloatTensor): `[B, T, vocab]` + """ + return F.log_softmax(self.output(eouts) / temperature, axis=-1) + + def _decode_batch_greedy(self, probs_split, vocab_list): + """Decode by best path for a batch of probs matrix input. + :param probs_split: List of 2-D probability matrix, and each consists + of prob vectors for one speech utterancce. + :param probs_split: List of matrix + :param vocab_list: List of tokens in the vocabulary, for decoding. + :type vocab_list: list + :return: List of transcription texts. + :rtype: List of str + """ + results = [] + for i, probs in enumerate(probs_split): + output_transcription = ctc_greedy_decoder( + probs_seq=probs, vocabulary=vocab_list) + results.append(output_transcription) + return results + + def _init_ext_scorer(self, beam_alpha, beam_beta, language_model_path, + vocab_list): + """Initialize the external scorer. + :param beam_alpha: Parameter associated with language model. + :type beam_alpha: float + :param beam_beta: Parameter associated with word count. + :type beam_beta: float + :param language_model_path: Filepath for language model. If it is + empty, the external scorer will be set to + None, and the decoding method will be pure + beam search without scorer. + :type language_model_path: str|None + :param vocab_list: List of tokens in the vocabulary, for decoding. + :type vocab_list: list + """ + # init once + if self._ext_scorer != None: + return + + if language_model_path != '': + logger.info("begin to initialize the external scorer " + "for decoding") + self._ext_scorer = Scorer(beam_alpha, beam_beta, + language_model_path, vocab_list) + lm_char_based = self._ext_scorer.is_character_based() + lm_max_order = self._ext_scorer.get_max_order() + lm_dict_size = self._ext_scorer.get_dict_size() + logger.info("language model: " + "is_character_based = %d," % lm_char_based + + " max_order = %d," % lm_max_order + " dict_size = %d" % + lm_dict_size) + logger.info("end initializing scorer") + else: + self._ext_scorer = None + logger.info("no language model provided, " + "decoding by pure beam search without scorer.") + + def _decode_batch_beam_search(self, probs_split, beam_alpha, beam_beta, + beam_size, cutoff_prob, cutoff_top_n, + vocab_list, num_processes): + """Decode by beam search for a batch of probs matrix input. + :param probs_split: List of 2-D probability matrix, and each consists + of prob vectors for one speech utterancce. + :param probs_split: List of matrix + :param beam_alpha: Parameter associated with language model. + :type beam_alpha: float + :param beam_beta: Parameter associated with word count. + :type beam_beta: float + :param beam_size: Width for Beam search. + :type beam_size: int + :param cutoff_prob: Cutoff probability in pruning, + default 1.0, no pruning. + :type cutoff_prob: float + :param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n + characters with highest probs in vocabulary will be + used in beam search, default 40. + :type cutoff_top_n: int + :param vocab_list: List of tokens in the vocabulary, for decoding. + :type vocab_list: list + :param num_processes: Number of processes (CPU) for decoder. + :type num_processes: int + :return: List of transcription texts. 
+ :rtype: List of str + """ + if self._ext_scorer != None: + self._ext_scorer.reset_params(beam_alpha, beam_beta) + + # beam search decode + num_processes = min(num_processes, len(probs_split)) + beam_search_results = ctc_beam_search_decoder_batch( + probs_split=probs_split, + vocabulary=vocab_list, + beam_size=beam_size, + num_processes=num_processes, + ext_scoring_func=self._ext_scorer, + cutoff_prob=cutoff_prob, + cutoff_top_n=cutoff_top_n) + + results = [result[0][1] for result in beam_search_results] + return results + + def init_decode(self, beam_alpha, beam_beta, lang_model_path, vocab_list, + decoding_method): + if decoding_method == "ctc_beam_search": + self._init_ext_scorer(beam_alpha, beam_beta, lang_model_path, + vocab_list) + + def decode_probs(self, probs, logits_lens, vocab_list, decoding_method, + lang_model_path, beam_alpha, beam_beta, beam_size, + cutoff_prob, cutoff_top_n, num_processes): + """ probs: activation after softmax + logits_len: audio output lens + """ + probs_split = [probs[i, :l, :] for i, l in enumerate(logits_lens)] + if decoding_method == "ctc_greedy": + result_transcripts = self._decode_batch_greedy( + probs_split=probs_split, vocab_list=vocab_list) + elif decoding_method == "ctc_beam_search": + result_transcripts = self._decode_batch_beam_search( + probs_split=probs_split, + beam_alpha=beam_alpha, + beam_beta=beam_beta, + beam_size=beam_size, + cutoff_prob=cutoff_prob, + cutoff_top_n=cutoff_top_n, + vocab_list=vocab_list, + num_processes=num_processes) + else: + raise ValueError(f"Not support: {decoding_method}") + return result_transcripts + + +class DeepSpeech2Model(nn.Layer): + """The DeepSpeech2 network structure. + + :param audio_data: Audio spectrogram data layer. + :type audio_data: Variable + :param text_data: Transcription text data layer. + :type text_data: Variable + :param audio_len: Valid sequence length data layer. + :type audio_len: Variable + :param masks: Masks data layer to reset padding. + :type masks: Variable + :param dict_size: Dictionary size for tokenized transcription. + :type dict_size: int + :param num_conv_layers: Number of stacking convolution layers. + :type num_conv_layers: int + :param num_rnn_layers: Number of stacking RNN layers. + :type num_rnn_layers: int + :param rnn_size: RNN layer size (dimension of RNN cells). + :type rnn_size: int + :param use_gru: Use gru if set True. Use simple rnn if set False. + :type use_gru: bool + :param share_rnn_weights: Whether to share input-hidden weights between + forward and backward direction RNNs. + It is only available when use_gru=False. + :type share_weights: bool + :return: A tuple of an output unnormalized log probability layer ( + before softmax) and a ctc cost layer. + :rtype: tuple of LayerOutput + """ + + @classmethod + def params(cls, config: Optional[CfgNode]=None) -> CfgNode: + default = CfgNode( + dict( + num_conv_layers=2, #Number of stacking convolution layers. + num_rnn_layers=3, #Number of stacking RNN layers. + rnn_layer_size=1024, #RNN layer size (number of RNN cells). + use_gru=True, #Use gru if set True. Use simple rnn if set False. + share_rnn_weights=True #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported. 
+ )) + if config is not None: + config.merge_from_other_cfg(default) + return default + + def __init__(self, + feat_size, + dict_size, + num_conv_layers=2, + num_rnn_layers=3, + rnn_size=1024, + use_gru=False, + share_rnn_weights=True): + super().__init__() + self.encoder = CRNNEncoder( + feat_size=feat_size, + dict_size=dict_size, + num_conv_layers=num_conv_layers, + num_rnn_layers=num_rnn_layers, + rnn_size=rnn_size, + use_gru=use_gru, + share_rnn_weights=share_rnn_weights) + assert (self.encoder.output_size == rnn_size * 2) + self.decoder = CTCDecoder( + enc_n_units=self.encoder.output_size, vocab_size=dict_size) + + def forward(self, audio, text, audio_len, text_len): + """Compute Model loss + + Args: + audio (Tenosr): [B, D, T] + text (Tensor): [B, T] + audio_len (Tensor): [B] + text_len (Tensor): [B] + + Returns: + loss (Tenosr): [1] + """ + + eouts, eouts_len = self.encoder(audio, audio_len) + loss = self.decoder(eouts, eouts_len, text, text_len) + return loss + + @paddle.no_grad() + def decode(self, audio, audio_len, vocab_list, decoding_method, + lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob, + cutoff_top_n, num_processes): + # init once + # decoders only accept string encoded in utf-8 + self.decoder.init_decode( + beam_alpha=beam_alpha, + beam_beta=beam_beta, + lang_model_path=lang_model_path, + vocab_list=vocab_list, + decoding_method=decoding_method) + + eouts, eouts_len = self.encoder(audio, audio_len) + probs = self.decoder.probs(eouts) + return self.decoder.decode_probs( + probs.numpy(), eouts_len, vocab_list, decoding_method, + lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob, + cutoff_top_n, num_processes) + + @classmethod + def from_pretrained(cls, dataset, config, checkpoint_path): + """Build a DeepSpeech2Model model from a pretrained model. + Parameters + ---------- + dataset: paddle.io.Dataset + + config: yacs.config.CfgNode + model configs + + checkpoint_path: Path or str + the path of pretrained model checkpoint, without extension name + + Returns + ------- + DeepSpeech2Model + The model built from pretrained result. + """ + model = cls(feat_size=dataset.feature_size, + dict_size=dataset.vocab_size, + num_conv_layers=config.model.num_conv_layers, + num_rnn_layers=config.model.num_rnn_layers, + rnn_size=config.model.rnn_layer_size, + use_gru=config.model.use_gru, + share_rnn_weights=config.model.share_rnn_weights) + checkpoint.load_parameters(model, checkpoint_path=checkpoint_path) + layer_tools.summary(model) + return model + + +class DeepSpeech2InferModel(DeepSpeech2Model): + def __init__(self, + feat_size, + dict_size, + num_conv_layers=2, + num_rnn_layers=3, + rnn_size=1024, + use_gru=False, + share_rnn_weights=True): + super().__init__( + feat_size=feat_size, + dict_size=dict_size, + num_conv_layers=num_conv_layers, + num_rnn_layers=num_rnn_layers, + rnn_size=rnn_size, + use_gru=use_gru, + share_rnn_weights=share_rnn_weights) + + def forward(self, audio, audio_len): + """export model function + + Args: + audio (Tensor): [B, D, T] + audio_len (Tensor): [B] + + Returns: + probs: probs after softmax + """ + eouts, eouts_len = self.encoder(audio, audio_len) + probs = self.decoder.probs(eouts) + return probs diff --git a/deepspeech/modules/__init__.py b/deepspeech/modules/__init__.py new file mode 100644 index 000000000..185a92b8d --- /dev/null +++ b/deepspeech/modules/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/deepspeech/modules/activation.py b/deepspeech/modules/activation.py new file mode 100644 index 000000000..14861fcf7 --- /dev/null +++ b/deepspeech/modules/activation.py @@ -0,0 +1,32 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import numpy as np + +import paddle +from paddle import nn +from paddle.nn import functional as F +from paddle.nn import initializer as I + +logger = logging.getLogger(__name__) + +__all__ = ['brelu'] + + +def brelu(x, t_min=0.0, t_max=24.0, name=None): + # paddle.to_tensor is dygraph_only can not work under JIT + t_min = paddle.full(shape=[1], fill_value=t_min, dtype='float32') + t_max = paddle.full(shape=[1], fill_value=t_max, dtype='float32') + return x.maximum(t_min).minimum(t_max) diff --git a/deepspeech/modules/conv.py b/deepspeech/modules/conv.py new file mode 100644 index 000000000..7d64c963d --- /dev/null +++ b/deepspeech/modules/conv.py @@ -0,0 +1,147 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging + +import paddle +from paddle import nn +from paddle.nn import functional as F +from paddle.nn import initializer as I + +from deepspeech.modules.mask import sequence_mask +from deepspeech.modules.activation import brelu + +logger = logging.getLogger(__name__) + +__all__ = ['ConvStack'] + + +class ConvBn(nn.Layer): + """Convolution layer with batch normalization. + + :param kernel_size: The x dimension of a filter kernel. Or input a tuple for + two image dimension. + :type kernel_size: int|tuple|list + :param num_channels_in: Number of input channels. + :type num_channels_in: int + :param num_channels_out: Number of output channels. + :type num_channels_out: int + :param stride: The x dimension of the stride. Or input a tuple for two + image dimension. + :type stride: int|tuple|list + :param padding: The x dimension of the padding. 
Or input a tuple for two + image dimension. + :type padding: int|tuple|list + :param act: Activation type, relu|brelu + :type act: string + :return: Batch norm layer after convolution layer. + :rtype: Variable + + """ + + def __init__(self, num_channels_in, num_channels_out, kernel_size, stride, + padding, act): + + super().__init__() + assert len(kernel_size) == 2 + assert len(stride) == 2 + assert len(padding) == 2 + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + + self.conv = nn.Conv2D( + num_channels_in, + num_channels_out, + kernel_size=kernel_size, + stride=stride, + padding=padding, + weight_attr=None, + bias_attr=False, + data_format='NCHW') + + self.bn = nn.BatchNorm2D( + num_channels_out, + weight_attr=None, + bias_attr=None, + data_format='NCHW') + self.act = F.relu if act == 'relu' else brelu + + def forward(self, x, x_len): + """ + x(Tensor): audio, shape [B, C, D, T] + """ + x = self.conv(x) + x = self.bn(x) + x = self.act(x) + + x_len = (x_len - self.kernel_size[1] + 2 * self.padding[1] + ) // self.stride[1] + 1 + + # reset padding part to 0 + masks = sequence_mask(x_len) #[B, T] + masks = masks.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T] + x = x.multiply(masks) + + return x, x_len + + +class ConvStack(nn.Layer): + """Convolution group with stacked convolution layers. + + :param feat_size: audio feature dim. + :type feat_size: int + :param num_stacks: Number of stacked convolution layers. + :type num_stacks: int + """ + + def __init__(self, feat_size, num_stacks): + super().__init__() + self.feat_size = feat_size # D + self.num_stacks = num_stacks + + self.conv_in = ConvBn( + num_channels_in=1, + num_channels_out=32, + kernel_size=(41, 11), #[D, T] + stride=(2, 3), + padding=(20, 5), + act='brelu') + + out_channel = 32 + self.conv_stack = nn.LayerList([ + ConvBn( + num_channels_in=32, + num_channels_out=out_channel, + kernel_size=(21, 11), + stride=(2, 1), + padding=(10, 5), + act='brelu') for i in range(num_stacks - 1) + ]) + + # conv output feat_dim + output_height = (feat_size - 1) // 2 + 1 + for i in range(self.num_stacks - 1): + output_height = (output_height - 1) // 2 + 1 + self.output_height = out_channel * output_height + + def forward(self, x, x_len): + """ + x: shape [B, C, D, T] + x_len : shape [B] + """ + x, x_len = self.conv_in(x, x_len) + for i, conv in enumerate(self.conv_stack): + x, x_len = conv(x, x_len) + return x, x_len diff --git a/deepspeech/modules/loss.py b/deepspeech/modules/loss.py new file mode 100644 index 000000000..b0e021a59 --- /dev/null +++ b/deepspeech/modules/loss.py @@ -0,0 +1,65 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import logging
+
+import paddle
+from paddle import nn
+from paddle.nn import functional as F
+from paddle.nn import initializer as I
+
+logger = logging.getLogger(__name__)
+
+__all__ = ['CTCLoss']
+
+
+def ctc_loss(logits,
+             labels,
+             input_lengths,
+             label_lengths,
+             blank=0,
+             reduction='mean',
+             norm_by_times=True):
+    #logger.info("my ctc loss with norm by times")
+    ## https://github.com/PaddlePaddle/Paddle/blob/f5ca2db2cc/paddle/fluid/operators/warpctc_op.h#L403
+    loss_out = paddle.fluid.layers.warpctc(logits, labels, blank, norm_by_times,
+                                           input_lengths, label_lengths)
+
+    loss_out = paddle.fluid.layers.squeeze(loss_out, [-1])
+    logger.info(f"warpctc loss: {loss_out}/{loss_out.shape} ")
+    assert reduction in ['mean', 'sum', 'none']
+    if reduction == 'mean':
+        loss_out = paddle.mean(loss_out / label_lengths)
+    elif reduction == 'sum':
+        loss_out = paddle.sum(loss_out)
+    logger.info(f"ctc loss: {loss_out}")
+    return loss_out
+
+
+F.ctc_loss = ctc_loss
+
+
+class CTCLoss(nn.Layer):
+    def __init__(self, blank_id):
+        super().__init__()
+        # the last token id is used as the blank id
+        self.loss = nn.CTCLoss(blank=blank_id, reduction='sum')
+
+    def forward(self, logits, text, logits_len, text_len):
+        # warp-ctc does softmax on the activations
+        # warp-ctc needs activations with shape [T, B, V + 1]
+        logits = logits.transpose([1, 0, 2])
+
+        ctc_loss = self.loss(logits, text, logits_len, text_len)
+        return ctc_loss
diff --git a/deepspeech/modules/mask.py b/deepspeech/modules/mask.py
new file mode 100644
index 000000000..cb036c141
--- /dev/null
+++ b/deepspeech/modules/mask.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+
+import paddle
+from paddle import nn
+from paddle.nn import functional as F
+from paddle.nn import initializer as I
+
+logger = logging.getLogger(__name__)
+
+__all__ = ['sequence_mask']
+
+
+def sequence_mask(x_len, max_len=None, dtype='float32'):
+    max_len = max_len or x_len.max()
+    x_len = paddle.unsqueeze(x_len, -1)
+    row_vector = paddle.arange(max_len)
+    #mask = row_vector < x_len
+    mask = row_vector > x_len  # a bug: broadcasting goes wrong here
+    mask = paddle.cast(mask, dtype)
+    return mask
diff --git a/deepspeech/modules/rnn.py b/deepspeech/modules/rnn.py
new file mode 100644
index 000000000..3cb8c7d05
--- /dev/null
+++ b/deepspeech/modules/rnn.py
@@ -0,0 +1,310 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +import math +import logging + +import paddle +from paddle import nn +from paddle.nn import functional as F +from paddle.nn import initializer as I + +from deepspeech.modules.mask import sequence_mask +from deepspeech.modules.activation import brelu + +logger = logging.getLogger(__name__) + +__all__ = ['RNNStack'] + + +class RNNCell(nn.RNNCellBase): + r""" + Elman RNN (SimpleRNN) cell. Given the inputs and previous states, it + computes the outputs and updates states. + The formula used is as follows: + .. math:: + h_{t} & = act(x_{t} + b_{ih} + W_{hh}h_{t-1} + b_{hh}) + y_{t} & = h_{t} + + where :math:`act` is for :attr:`activation`. + """ + + def __init__(self, + hidden_size, + activation="tanh", + weight_ih_attr=None, + weight_hh_attr=None, + bias_ih_attr=None, + bias_hh_attr=None, + name=None): + super().__init__() + std = 1.0 / math.sqrt(hidden_size) + self.weight_hh = self.create_parameter( + (hidden_size, hidden_size), + weight_hh_attr, + default_initializer=I.Uniform(-std, std)) + self.bias_ih = None + self.bias_hh = self.create_parameter( + (hidden_size, ), + bias_hh_attr, + is_bias=True, + default_initializer=I.Uniform(-std, std)) + + self.hidden_size = hidden_size + if activation not in ["tanh", "relu", "brelu"]: + raise ValueError( + "activation for SimpleRNNCell should be tanh or relu, " + "but get {}".format(activation)) + self.activation = activation + self._activation_fn = paddle.tanh \ + if activation == "tanh" \ + else F.relu + if activation == 'brelu': + self._activation_fn = brelu + + def forward(self, inputs, states=None): + if states is None: + states = self.get_initial_states(inputs, self.state_shape) + pre_h = states + i2h = inputs + if self.bias_ih is not None: + i2h += self.bias_ih + h2h = paddle.matmul(pre_h, self.weight_hh, transpose_y=True) + if self.bias_hh is not None: + h2h += self.bias_hh + h = self._activation_fn(i2h + h2h) + return h, h + + @property + def state_shape(self): + return (self.hidden_size, ) + + +class GRUCell(nn.RNNCellBase): + r""" + Gated Recurrent Unit (GRU) RNN cell. Given the inputs and previous states, + it computes the outputs and updates states. + The formula for GRU used is as follows: + .. math:: + r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}h_{t-1} + b_{hr}) + z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}h_{t-1} + b_{hz}) + \widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}h_{t-1} + b_{hc})) + h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \widetilde{h}_{t} + y_{t} & = h_{t} + + where :math:`\sigma` is the sigmoid fucntion, and * is the elemetwise + multiplication operator. 
+ """ + + def __init__(self, + input_size, + hidden_size, + weight_ih_attr=None, + weight_hh_attr=None, + bias_ih_attr=None, + bias_hh_attr=None, + name=None): + super().__init__() + std = 1.0 / math.sqrt(hidden_size) + self.weight_hh = self.create_parameter( + (3 * hidden_size, hidden_size), + weight_hh_attr, + default_initializer=I.Uniform(-std, std)) + self.bias_ih = None + self.bias_hh = self.create_parameter( + (3 * hidden_size, ), + bias_hh_attr, + is_bias=True, + default_initializer=I.Uniform(-std, std)) + + self.hidden_size = hidden_size + self.input_size = input_size + self._gate_activation = F.sigmoid + self._activation = paddle.tanh + #self._activation = F.relu + + def forward(self, inputs, states=None): + if states is None: + states = self.get_initial_states(inputs, self.state_shape) + + pre_hidden = states + x_gates = inputs + if self.bias_ih is not None: + x_gates = x_gates + self.bias_ih + h_gates = paddle.matmul(pre_hidden, self.weight_hh, transpose_y=True) + if self.bias_hh is not None: + h_gates = h_gates + self.bias_hh + + x_r, x_z, x_c = paddle.split(x_gates, num_or_sections=3, axis=1) + h_r, h_z, h_c = paddle.split(h_gates, num_or_sections=3, axis=1) + + r = self._gate_activation(x_r + h_r) + z = self._gate_activation(x_z + h_z) + c = self._activation(x_c + r * h_c) # apply reset gate after mm + h = (pre_hidden - c) * z + c + # https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/fluid/layers/dynamic_gru_cn.html#dynamic-gru + + return h, h + + @property + def state_shape(self): + r""" + The `state_shape` of GRUCell is a shape `[hidden_size]` (-1 for batch + size would be automatically inserted into shape). The shape corresponds + to the shape of :math:`h_{t-1}`. + """ + return (self.hidden_size, ) + + +class BiRNNWithBN(nn.Layer): + """Bidirectonal simple rnn layer with sequence-wise batch normalization. + The batch normalization is only performed on input-state weights. + + :param name: Name of the layer parameters. + :type name: string + :param size: Dimension of RNN cells. + :type size: int + :param share_weights: Whether to share input-hidden weights between + forward and backward directional RNNs. + :type share_weights: bool + :return: Bidirectional simple rnn layer. + :rtype: Variable + """ + + def __init__(self, i_size, h_size, share_weights): + super().__init__() + self.share_weights = share_weights + if self.share_weights: + #input-hidden weights shared between bi-directional rnn. 
+            self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False)
+            # batch norm is only performed on input-state projection
+            self.fw_bn = nn.BatchNorm1D(
+                h_size, bias_attr=None, data_format='NLC')
+            self.bw_fc = self.fw_fc
+            self.bw_bn = self.fw_bn
+        else:
+            self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False)
+            self.fw_bn = nn.BatchNorm1D(
+                h_size, bias_attr=None, data_format='NLC')
+            self.bw_fc = nn.Linear(i_size, h_size, bias_attr=False)
+            self.bw_bn = nn.BatchNorm1D(
+                h_size, bias_attr=None, data_format='NLC')
+
+        self.fw_cell = RNNCell(hidden_size=h_size, activation='brelu')
+        self.bw_cell = RNNCell(hidden_size=h_size, activation='brelu')
+        self.fw_rnn = nn.RNN(
+            self.fw_cell, is_reverse=False, time_major=False)  #[B, T, D]
+        self.bw_rnn = nn.RNN(
+            self.bw_cell, is_reverse=True, time_major=False)  #[B, T, D]
+
+    def forward(self, x, x_len):
+        # x, shape [B, T, D]
+        fw_x = self.fw_bn(self.fw_fc(x))
+        bw_x = self.bw_bn(self.bw_fc(x))
+        fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len)
+        bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len)
+        x = paddle.concat([fw_x, bw_x], axis=-1)
+        return x, x_len
+
+
+class BiGRUWithBN(nn.Layer):
+    """Bidirectional GRU layer with sequence-wise batch normalization.
+    The batch normalization is only performed on input-state weights.
+
+    :param name: Name of the layer.
+    :type name: string
+    :param input: Input layer.
+    :type input: Variable
+    :param size: Dimension of GRU cells.
+    :type size: int
+    :param act: Activation type.
+    :type act: string
+    :return: Bidirectional GRU layer.
+    :rtype: Variable
+    """
+
+    def __init__(self, i_size, h_size, act):
+        super().__init__()
+        hidden_size = h_size * 3
+
+        self.fw_fc = nn.Linear(i_size, hidden_size, bias_attr=False)
+        self.fw_bn = nn.BatchNorm1D(
+            hidden_size, bias_attr=None, data_format='NLC')
+        self.bw_fc = nn.Linear(i_size, hidden_size, bias_attr=False)
+        self.bw_bn = nn.BatchNorm1D(
+            hidden_size, bias_attr=None, data_format='NLC')
+
+        self.fw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size)
+        self.bw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size)
+        self.fw_rnn = nn.RNN(
+            self.fw_cell, is_reverse=False, time_major=False)  #[B, T, D]
+        self.bw_rnn = nn.RNN(
+            self.bw_cell, is_reverse=True, time_major=False)  #[B, T, D]
+
+    def forward(self, x, x_len):
+        # x, shape [B, T, D]
+        fw_x = self.fw_bn(self.fw_fc(x))
+        bw_x = self.bw_bn(self.bw_fc(x))
+        fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len)
+        bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len)
+        x = paddle.concat([fw_x, bw_x], axis=-1)
+        return x, x_len
+
+
+class RNNStack(nn.Layer):
+    """RNN group with stacked bidirectional simple RNN or GRU layers.
+
+    :param input: Input layer.
+    :type input: Variable
+    :param size: Dimension of RNN cells in each layer.
+    :type size: int
+    :param num_stacks: Number of stacked rnn layers.
+    :type num_stacks: int
+    :param use_gru: Use gru if set True. Use simple rnn if set False.
+    :type use_gru: bool
+    :param share_rnn_weights: Whether to share input-hidden weights between
+                              forward and backward directional RNNs.
+                              It is only available when use_gru=False.
+    :type share_weights: bool
+    :return: Output layer of the RNN group.
+ :rtype: Variable + """ + + def __init__(self, i_size, h_size, num_stacks, use_gru, share_rnn_weights): + super().__init__() + self.rnn_stacks = nn.LayerList() + for i in range(num_stacks): + if use_gru: + #default:GRU using tanh + self.rnn_stacks.append( + BiGRUWithBN(i_size=i_size, h_size=h_size, act="relu")) + else: + self.rnn_stacks.append( + BiRNNWithBN( + i_size=i_size, + h_size=h_size, + share_weights=share_rnn_weights)) + i_size = h_size * 2 + + def forward(self, x, x_len): + """ + x: shape [B, T, D] + x_len: shpae [B] + """ + for i, rnn in enumerate(self.rnn_stacks): + x, x_len = rnn(x, x_len) + masks = sequence_mask(x_len) #[B, T] + masks = masks.unsqueeze(-1) # [B, T, 1] + x = x.multiply(masks) + return x, x_len diff --git a/tools/_init_paths.py b/deepspeech/training/__init__.py similarity index 69% rename from tools/_init_paths.py rename to deepspeech/training/__init__.py index c4b28c643..1071a3dd7 100644 --- a/tools/_init_paths.py +++ b/deepspeech/training/__init__.py @@ -11,19 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Set up paths for DS2""" -import os.path -import sys - - -def add_path(path): - if path not in sys.path: - sys.path.insert(0, path) - - -this_dir = os.path.dirname(__file__) - -# Add project path to PYTHONPATH -proj_path = os.path.join(this_dir, '..') -add_path(proj_path) +from deepspeech.training.trainer import * diff --git a/deepspeech/training/cli.py b/deepspeech/training/cli.py new file mode 100644 index 000000000..0994f71f5 --- /dev/null +++ b/deepspeech/training/cli.py @@ -0,0 +1,69 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + + +def default_argument_parser(): + r"""A simple yet genral argument parser for experiments with parakeet. + + This is used in examples with parakeet. And it is intended to be used by + other experiments with parakeet. It requires a minimal set of command line + arguments to start a training script. + + The ``--config`` and ``--opts`` are used for overwrite the deault + configuration. + + The ``--data`` and ``--output`` specifies the data path and output path. + Resuming training from existing progress at the output directory is the + intended default behavior. + + The ``--checkpoint_path`` specifies the checkpoint to load from. + + The ``--device`` and ``--nprocs`` specifies how to run the training. 
+ + + See Also + -------- + parakeet.training.experiment + Returns + ------- + argparse.ArgumentParser + the parser + """ + parser = argparse.ArgumentParser() + + # yapf: disable + # data and output + parser.add_argument("--config", metavar="FILE", help="path of the config file to overwrite to default config with.") + parser.add_argument("--dump-config", metavar="FILE", help="dump config to yaml file.") + # parser.add_argument("--data", metavar="DATA_DIR", help="path to the datatset.") + parser.add_argument("--output", metavar="OUTPUT_DIR", help="path to save checkpoint and logs.") + + # load from saved checkpoint + parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load") + + # save jit model to + parser.add_argument("--export_path", type=str, help="path of the jit model to save") + + # running + parser.add_argument("--device", type=str, default='gpu', choices=["cpu", "gpu"], help="device type to use, cpu and gpu are supported.") + parser.add_argument("--nprocs", type=int, default=1, help="number of parallel processes to use.") + + # overwrite extra config and default config + #parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs") + parser.add_argument("--opts", type=str, default=[], nargs='+', help="options to overwrite --config file and the default config, passing in KEY VALUE pairs") + # yapd: enable + + return parser diff --git a/deepspeech/training/gradclip.py b/deepspeech/training/gradclip.py new file mode 100644 index 000000000..1693b76df --- /dev/null +++ b/deepspeech/training/gradclip.py @@ -0,0 +1,74 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
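Stepping back to `deepspeech/training/cli.py` for a moment: the sketch below shows how `default_argument_parser()` is typically consumed. The config path, output directory and the `training.lr` key passed through `--opts` are illustrative assumptions, not values defined in this patch.

```python
# Hypothetical usage sketch for default_argument_parser(); the paths and the
# "training.lr" config key are assumptions for illustration only.
from deepspeech.training.cli import default_argument_parser

parser = default_argument_parser()
args = parser.parse_args([
    "--config", "conf/deepspeech2.yaml",   # overwrite the default config
    "--output", "exp/deepspeech2",         # checkpoints and logs go here
    "--device", "gpu",
    "--nprocs", "1",
    "--opts", "training.lr", "0.001",      # KEY VALUE pairs, space separated
])
print(args.config, args.output, args.opts)  # ['training.lr', '0.001']
```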
+ +import logging + +import paddle +from paddle.fluid.dygraph import base as imperative_base +from paddle.fluid import layers +from paddle.fluid import core + +logger = logging.getLogger(__name__) + + +class MyClipGradByGlobalNorm(paddle.nn.ClipGradByGlobalNorm): + def __init__(self, clip_norm): + super().__init__(clip_norm) + + @imperative_base.no_grad + def _dygraph_clip(self, params_grads): + params_and_grads = [] + sum_square_list = [] + for p, g in params_grads: + if g is None: + continue + if getattr(p, 'need_clip', True) is False: + continue + merge_grad = g + if g.type == core.VarDesc.VarType.SELECTED_ROWS: + merge_grad = layers.merge_selected_rows(g) + merge_grad = layers.get_tensor_from_selected_rows(merge_grad) + square = layers.square(merge_grad) + sum_square = layers.reduce_sum(square) + logger.info( + f"Grad Before Clip: {p.name}: {float(layers.sqrt(layers.reduce_sum(layers.square(merge_grad))) ) }" + ) + sum_square_list.append(sum_square) + + # all parameters have been filterd out + if len(sum_square_list) == 0: + return params_grads + + global_norm_var = layers.concat(sum_square_list) + global_norm_var = layers.reduce_sum(global_norm_var) + global_norm_var = layers.sqrt(global_norm_var) + logger.info(f"Grad Global Norm: {float(global_norm_var)}!!!!") + max_global_norm = layers.fill_constant( + shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm) + clip_var = layers.elementwise_div( + x=max_global_norm, + y=layers.elementwise_max(x=global_norm_var, y=max_global_norm)) + for p, g in params_grads: + if g is None: + continue + if getattr(p, 'need_clip', True) is False: + params_and_grads.append((p, g)) + continue + new_grad = layers.elementwise_mul(x=g, y=clip_var) + logger.info( + f"Grad After Clip: {p.name}: {float(layers.sqrt(layers.reduce_sum(layers.square(merge_grad))) ) }" + ) + params_and_grads.append((p, new_grad)) + + return params_and_grads diff --git a/deepspeech/training/trainer.py b/deepspeech/training/trainer.py new file mode 100644 index 000000000..ebaed256b --- /dev/null +++ b/deepspeech/training/trainer.py @@ -0,0 +1,327 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +import logging +import logging.handlers +from pathlib import Path +import numpy as np +from collections import defaultdict + +import paddle +from paddle import distributed as dist +from paddle.distributed.utils import get_gpus +from tensorboardX import SummaryWriter + +from deepspeech.utils import checkpoint +from deepspeech.utils import mp_tools + +__all__ = ["Trainer"] + + +class Trainer(): + """ + An experiment template in order to structure the training code and take + care of saving, loading, logging, visualization stuffs. It's intended to + be flexible and simple. 
+ + So it only handles output directory (create directory for the output, + create a checkpoint directory, dump the config in use and create + visualizer and logger) in a standard way without enforcing any + input-output protocols to the model and dataloader. It leaves the main + part for the user to implement their own (setup the model, criterion, + optimizer, define a training step, define a validation function and + customize all the text and visual logs). + It does not save too much boilerplate code. The users still have to write + the forward/backward/update mannually, but they are free to add + non-standard behaviors if needed. + We have some conventions to follow. + 1. Experiment should have ``model``, ``optimizer``, ``train_loader`` and + ``valid_loader``, ``config`` and ``args`` attributes. + 2. The config should have a ``training`` field, which has + ``valid_interval``, ``save_interval`` and ``max_iteration`` keys. It is + used as the trigger to invoke validation, checkpointing and stop of the + experiment. + 3. There are four methods, namely ``train_batch``, ``valid``, + ``setup_model`` and ``setup_dataloader`` that should be implemented. + Feel free to add/overwrite other methods and standalone functions if you + need. + + Parameters + ---------- + config: yacs.config.CfgNode + The configuration used for the experiment. + + args: argparse.Namespace + The parsed command line arguments. + Examples + -------- + >>> def main_sp(config, args): + >>> exp = Trainer(config, args) + >>> exp.setup() + >>> exp.run() + >>> + >>> config = get_cfg_defaults() + >>> parser = default_argument_parser() + >>> args = parser.parse_args() + >>> if args.config: + >>> config.merge_from_file(args.config) + >>> if args.opts: + >>> config.merge_from_list(args.opts) + >>> config.freeze() + >>> + >>> if args.nprocs > 1 and args.device == "gpu": + >>> dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) + >>> else: + >>> main_sp(config, args) + """ + + def __init__(self, config, args): + self.config = config + self.args = args + self.optimizer = None + self.visualizer = None + self.output_dir = None + self.checkpoint_dir = None + self.logger = None + + def setup(self): + """Setup the experiment. + """ + paddle.set_device(self.args.device) + if self.parallel: + self.init_parallel() + + self.setup_output_dir() + self.dump_config() + self.setup_visualizer() + self.setup_logger() + self.setup_checkpointer() + + self.setup_dataloader() + self.setup_model() + + self.iteration = 0 + self.epoch = 0 + + @property + def parallel(self): + """A flag indicating whether the experiment should run with + multiprocessing. + """ + return self.args.device == "gpu" and self.args.nprocs > 1 + + def init_parallel(self): + """Init environment for multiprocess training. + """ + dist.init_parallel_env() + + @mp_tools.rank_zero_only + def save(self): + """Save checkpoint (model parameters and optimizer states). + """ + checkpoint.save_parameters(self.checkpoint_dir, self.iteration, + self.model, self.optimizer) + + def resume_or_load(self): + """Resume from latest checkpoint at checkpoints in the output + directory or load a specified checkpoint. + + If ``args.checkpoint_path`` is not None, load the checkpoint, else + resume training. + """ + iteration = checkpoint.load_parameters( + self.model, + self.optimizer, + checkpoint_dir=self.checkpoint_dir, + checkpoint_path=self.args.checkpoint_path) + self.iteration = iteration + + def new_epoch(self): + """Reset the train loader and increment ``epoch``. 
+ """ + if self.parallel: + # batch sampler epoch start from 0 + self.train_loader.batch_sampler.set_epoch(self.epoch) + self.epoch += 1 + + def train(self): + """The training process. + + It includes forward/backward/update and periodical validation and + saving. + """ + self.logger.info( + f"Train Total Examples: {len(self.train_loader.dataset)}") + self.new_epoch() + while self.epoch <= self.config.training.n_epoch: + try: + for batch in self.train_loader: + self.iteration += 1 + self.train_batch(batch) + except Exception as e: + self.logger.error(e) + pass + + self.valid() + self.save() + self.lr_scheduler.step() + self.new_epoch() + + def run(self): + """The routine of the experiment after setup. This method is intended + to be used by the user. + """ + self.resume_or_load() + try: + self.train() + except KeyboardInterrupt: + self.save() + exit(-1) + finally: + self.destory() + + def setup_output_dir(self): + """Create a directory used for output. + """ + # output dir + output_dir = Path(self.args.output).expanduser() + output_dir.mkdir(parents=True, exist_ok=True) + + self.output_dir = output_dir + + def setup_checkpointer(self): + """Create a directory used to save checkpoints into. + + It is "checkpoints" inside the output directory. + """ + # checkpoint dir + checkpoint_dir = self.output_dir / "checkpoints" + checkpoint_dir.mkdir(exist_ok=True) + + self.checkpoint_dir = checkpoint_dir + + @mp_tools.rank_zero_only + def destory(self): + # https://github.com/pytorch/fairseq/issues/2357 + if self.visualizer: + self.visualizer.close() + + @mp_tools.rank_zero_only + def setup_visualizer(self): + """Initialize a visualizer to log the experiment. + + The visual log is saved in the output directory. + + Notes + ------ + Only the main process has a visualizer with it. Use multiple + visualizers in multiprocess to write to a same log file may cause + unexpected behaviors. + """ + # visualizer + visualizer = SummaryWriter(logdir=str(self.output_dir)) + + self.visualizer = visualizer + + def setup_logger(self): + """Initialize a text logger to log the experiment. + + Each process has its own text logger. The logging message is write to + the standard output and a text file named ``worker_n.log`` in the + output directory, where ``n`` means the rank of the process. 
+ when - how to split the log file by time interval + 'S' : Seconds + 'M' : Minutes + 'H' : Hours + 'D' : Days + 'W' : Week day + default value: 'D' + format - format of the log + default format: + %(levelname)s: %(asctime)s: %(filename)s:%(lineno)d * %(thread)d %(message)s + INFO: 12-09 18:02:42: log.py:40 * 139814749787872 HELLO WORLD + backup - how many backup file to keep + default value: 7 + """ + when = 'D' + backup = 7 + format = '[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s' + formatter = logging.Formatter(fmt=format, datefmt='%Y/%m/%d %H:%M:%S') + + logger = logging.getLogger(__name__) + logger.setLevel("INFO") + + stream_handler = logging.StreamHandler() + stream_handler.setFormatter(formatter) + logger.addHandler(stream_handler) + + log_file = self.output_dir / 'worker_{}.log'.format(dist.get_rank()) + # file_handler = logging.FileHandler(str(log_file)) + # file_handler.setFormatter(formatter) + # logger.addHandler(file_handler) + + # handler = logging.handlers.TimedRotatingFileHandler( + # str(self.output_dir / "warning.log"), when=when, backupCount=backup) + # handler.setLevel(logging.WARNING) + # handler.setFormatter(formatter) + # logger.addHandler(handler) + + # stop propagate for propagating may print + # log multiple times + logger.propagate = False + + # global logger + stdout = False + save_path = log_file + logging.basicConfig( + level=logging.DEBUG if stdout else logging.INFO, + format=format, + datefmt='%Y/%m/%d %H:%M:%S', + filename=save_path if not stdout else None) + self.logger = logger + + @mp_tools.rank_zero_only + def dump_config(self): + """Save the configuration used for this experiment. + + It is saved in to ``config.yaml`` in the output directory at the + beginning of the experiment. + """ + with open(self.output_dir / "config.yaml", 'wt') as f: + print(self.config, file=f) + + def train_batch(self): + """The training loop. A subclass should implement this method. + """ + raise NotImplementedError("train_batch should be implemented.") + + @mp_tools.rank_zero_only + @paddle.no_grad() + def valid(self): + """The validation. A subclass should implement this method. + """ + raise NotImplementedError("valid should be implemented.") + + def setup_model(self): + """Setup model, criterion and optimizer, etc. A subclass should + implement this method. + """ + raise NotImplementedError("setup_model should be implemented.") + + def setup_dataloader(self): + """Setup training dataloader and validation dataloader. A subclass + should implement this method. + """ + raise NotImplementedError("setup_dataloader should be implemented.") diff --git a/deepspeech/utils/__init__.py b/deepspeech/utils/__init__.py new file mode 100644 index 000000000..185a92b8d --- /dev/null +++ b/deepspeech/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
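To make the Trainer contract concrete, here is a structural sketch of a subclass. The class name, toy model, placeholder loss and the `config.training.lr` key are assumptions, and the data loaders are left as stubs where the real DeepSpeech2 dataset plumbing would go; note that `train()` also steps `self.lr_scheduler`, so the sketch sets one up next to the optimizer.

```python
# Structural sketch only: ToyExperiment and its internals are assumptions,
# not the repository's DeepSpeech2 trainer.
import paddle
from deepspeech.training.trainer import Trainer


class ToyExperiment(Trainer):
    def setup_model(self):
        self.model = paddle.nn.Linear(161, 29)  # stand-in acoustic model
        self.lr_scheduler = paddle.optimizer.lr.ExponentialDecay(
            learning_rate=self.config.training.lr, gamma=0.9)
        self.optimizer = paddle.optimizer.Adam(
            learning_rate=self.lr_scheduler,
            parameters=self.model.parameters())

    def setup_dataloader(self):
        # Real code builds DeepSpeech2Dataset + SpeechCollator DataLoaders here.
        self.train_loader = None
        self.valid_loader = None

    def train_batch(self, batch):
        # The base class calls self.train_batch(batch) from its train() loop.
        audio, text = batch
        loss = self.model(audio).mean()  # placeholder loss
        loss.backward()
        self.optimizer.step()
        self.optimizer.clear_grad()

    @paddle.no_grad()
    def valid(self):
        for batch in self.valid_loader:
            pass  # compute and log validation metrics here
```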
diff --git a/deepspeech/utils/checkpoint.py b/deepspeech/utils/checkpoint.py new file mode 100644 index 000000000..f2066fdec --- /dev/null +++ b/deepspeech/utils/checkpoint.py @@ -0,0 +1,140 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import time +import logging +import numpy as np + +import paddle +from paddle import distributed as dist +from paddle.nn import Layer +from paddle.optimizer import Optimizer + +from deepspeech.utils import mp_tools + +logger = logging.getLogger(__name__) + +__all__ = ["load_parameters", "save_parameters"] + + +def _load_latest_checkpoint(checkpoint_dir: str) -> int: + """Get the iteration number corresponding to the latest saved checkpoint. + Args: + checkpoint_dir (str): the directory where checkpoint is saved. + Returns: + int: the latest iteration number. + """ + checkpoint_record = os.path.join(checkpoint_dir, "checkpoint") + if (not os.path.isfile(checkpoint_record)): + return 0 + + # Fetch the latest checkpoint index. + with open(checkpoint_record, "rt") as handle: + latest_checkpoint = handle.readlines()[-1].strip() + step = latest_checkpoint.split(":")[-1] + iteration = int(step.split("-")[-1]) + + return iteration + + +def _save_checkpoint(checkpoint_dir: str, iteration: int): + """Save the iteration number of the latest model to be checkpointed. + Args: + checkpoint_dir (str): the directory where checkpoint is saved. + iteration (int): the latest iteration number. + Returns: + None + """ + checkpoint_record = os.path.join(checkpoint_dir, "checkpoint") + # Update the latest checkpoint index. + with open(checkpoint_record, "a+") as handle: + handle.write("model_checkpoint_path:step-{}\n".format(iteration)) + + +def load_parameters(model, + optimizer=None, + checkpoint_dir=None, + checkpoint_path=None): + """Load a specific model checkpoint from disk. + Args: + model (Layer): model to load parameters. + optimizer (Optimizer, optional): optimizer to load states if needed. + Defaults to None. + checkpoint_dir (str, optional): the directory where checkpoint is saved. + checkpoint_path (str, optional): if specified, load the checkpoint + stored in the checkpoint_path and the argument 'checkpoint_dir' will + be ignored. Defaults to None. + Returns: + iteration (int): number of iterations that the loaded checkpoint has + been trained. + """ + if checkpoint_path is not None: + iteration = int(os.path.basename(checkpoint_path).split("-")[-1]) + elif checkpoint_dir is not None: + iteration = _load_latest_checkpoint(checkpoint_dir) + if iteration == 0: + return iteration + checkpoint_path = os.path.join(checkpoint_dir, + "step-{}".format(iteration)) + else: + raise ValueError( + "At least one of 'checkpoint_dir' and 'checkpoint_path' should be specified!" 
+ ) + + rank = dist.get_rank() + + params_path = checkpoint_path + ".pdparams" + model_dict = paddle.load(params_path) + model.set_state_dict(model_dict) + logger.info( + "[checkpoint] Rank {}: loaded model from {}".format(rank, params_path)) + + optimizer_path = checkpoint_path + ".pdopt" + if optimizer and os.path.isfile(optimizer_path): + optimizer_dict = paddle.load(optimizer_path) + optimizer.set_state_dict(optimizer_dict) + logger.info("[checkpoint] Rank {}: loaded optimizer state from {}". + format(rank, optimizer_path)) + + return iteration + + +@mp_tools.rank_zero_only +def save_parameters(checkpoint_dir, iteration, model, optimizer=None): + """Checkpoint the latest trained model parameters. + Args: + checkpoint_dir (str): the directory where checkpoint is saved. + iteration (int): the latest iteration number. + model (Layer): model to be checkpointed. + optimizer (Optimizer, optional): optimizer to be checkpointed. + Defaults to None. + Returns: + None + """ + checkpoint_path = os.path.join(checkpoint_dir, "step-{}".format(iteration)) + + model_dict = model.state_dict() + params_path = checkpoint_path + ".pdparams" + paddle.save(model_dict, params_path) + logger.info("[checkpoint] Saved model to {}".format(params_path)) + + if optimizer: + opt_dict = optimizer.state_dict() + optimizer_path = checkpoint_path + ".pdopt" + paddle.save(opt_dict, optimizer_path) + logger.info( + "[checkpoint] Saved optimzier state to {}".format(optimizer_path)) + + _save_checkpoint(checkpoint_dir, iteration) diff --git a/utils/error_rate.py b/deepspeech/utils/error_rate.py similarity index 99% rename from utils/error_rate.py rename to deepspeech/utils/error_rate.py index d80546ee2..3fb6b769c 100644 --- a/utils/error_rate.py +++ b/deepspeech/utils/error_rate.py @@ -14,9 +14,10 @@ """This module provides functions to calculate error rate in different level. e.g. wer for word-level, cer for char-level. """ - import numpy as np +__all__ = ['word_errors', 'char_errors', 'wer', 'cer'] + def _levenshtein_distance(ref, hyp): """Levenshtein distance is a string metric for measuring the difference diff --git a/deepspeech/utils/layer_tools.py b/deepspeech/utils/layer_tools.py new file mode 100644 index 000000000..46a354761 --- /dev/null +++ b/deepspeech/utils/layer_tools.py @@ -0,0 +1,78 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
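Returning to `deepspeech/utils/checkpoint.py`: the helpers above can be exercised end-to-end with a throwaway model, as in the sketch below. The `exp/checkpoints` path and the stand-in Linear/Adam objects are arbitrary choices for illustration.

```python
# Illustration of deepspeech.utils.checkpoint; the model, optimizer and
# directory below are throwaway stand-ins.
import os
import paddle
from deepspeech.utils import checkpoint

model = paddle.nn.Linear(10, 10)
optimizer = paddle.optimizer.Adam(parameters=model.parameters())
ckpt_dir = "exp/checkpoints"
os.makedirs(ckpt_dir, exist_ok=True)

# Writes step-100.pdparams / step-100.pdopt and appends
# "model_checkpoint_path:step-100" to the `checkpoint` record file.
checkpoint.save_parameters(ckpt_dir, 100, model, optimizer)

# Resume from the latest record in the directory (returns 100 here), or pass
# checkpoint_path=".../step-100" to load a specific checkpoint instead.
iteration = checkpoint.load_parameters(
    model, optimizer, checkpoint_dir=ckpt_dir)
print("resumed from iteration", iteration)
```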
+ +import numpy as np +from paddle import nn + +__all__ = [ + "summary", "gradient_norm", "freeze", "unfreeze", "print_grads", + "print_params" +] + + +def summary(layer: nn.Layer, print_func=print): + num_params = num_elements = 0 + print_func("layer summary:") + for name, param in layer.state_dict().items(): + print_func("{}|{}|{}".format(name, param.shape, np.prod(param.shape))) + num_elements += np.prod(param.shape) + num_params += 1 + print_func("layer has {} parameters, {} elements.".format(num_params, + num_elements)) + + +def gradient_norm(layer: nn.Layer): + grad_norm_dict = {} + for name, param in layer.state_dict().items(): + if param.trainable: + grad = param.gradient() + grad_norm_dict[name] = np.linalg.norm(grad) / grad.size + return grad_norm_dict + + +def recursively_remove_weight_norm(layer: nn.Layer): + for layer in layer.sublayers(): + try: + nn.utils.remove_weight_norm(layer) + except: + # ther is not weight norm hoom in this layer + pass + + +def freeze(layer: nn.Layer): + for param in layer.parameters(): + param.trainable = False + + +def unfreeze(layer: nn.Layer): + for param in layer.parameters(): + param.trainable = True + + +def print_grads(model, print_func=print): + for n, p in model.named_parameters(): + msg = f"param grad: {n}: shape: {p.shape} grad: {p.grad}" + if print_func: + print_func(msg) + + +def print_params(model, print_func=print): + total = 0.0 + for n, p in model.named_parameters(): + msg = f"param: {n}: shape: {p.shape} stop_grad: {p.stop_gradient}" + total += np.prod(p.shape) + if print_func: + print_func(msg) + if print_func: + print_func(f"Total parameters: {total}!") diff --git a/deepspeech/utils/mp_tools.py b/deepspeech/utils/mp_tools.py new file mode 100644 index 000000000..9c3c3d548 --- /dev/null +++ b/deepspeech/utils/mp_tools.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle import distributed as dist +from functools import wraps + +__all__ = ["rank_zero_only"] + + +def rank_zero_only(func): + @wraps(func) + def wrapper(*args, **kwargs): + rank = dist.get_rank() + if rank != 0: + return + result = func(*args, **kwargs) + return result + + return wrapper diff --git a/deepspeech/utils/socket_server.py b/deepspeech/utils/socket_server.py new file mode 100644 index 000000000..2a0a62d01 --- /dev/null +++ b/deepspeech/utils/socket_server.py @@ -0,0 +1,111 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import random +import time +from time import gmtime, strftime +import socketserver +import struct +import wave + +from deepspeech.frontend.utility import read_manifest + +__all__ = ["socket_send", "warm_up_test", "AsrTCPServer", "AsrRequestHandler"] + + +def socket_send(server_ip: str, server_port: str, data: bytes): + # Connect to server and send data + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.connect((server_ip, server_port)) + sent = data + sock.sendall(struct.pack('>i', len(sent)) + sent) + print('Speech[length=%d] Sent.' % len(sent)) + # Receive data from the server and shut down + received = sock.recv(1024) + print("Recognition Results: {}".format(received.decode('utf8'))) + sock.close() + + +def warm_up_test(audio_process_handler, + manifest_path, + num_test_cases, + random_seed=0): + """Warming-up test.""" + manifest = read_manifest(manifest_path) + rng = random.Random(random_seed) + samples = rng.sample(manifest, num_test_cases) + for idx, sample in enumerate(samples): + print("Warm-up Test Case %d: %s", idx, sample['audio_filepath']) + start_time = time.time() + transcript = audio_process_handler(sample['audio_filepath']) + finish_time = time.time() + print("Response Time: %f, Transcript: %s" % + (finish_time - start_time, transcript)) + + +class AsrTCPServer(socketserver.TCPServer): + """The ASR TCP Server.""" + + def __init__(self, + server_address, + RequestHandlerClass, + speech_save_dir, + audio_process_handler, + bind_and_activate=True): + self.speech_save_dir = speech_save_dir + self.audio_process_handler = audio_process_handler + socketserver.TCPServer.__init__( + self, server_address, RequestHandlerClass, bind_and_activate=True) + + +class AsrRequestHandler(socketserver.BaseRequestHandler): + """The ASR request handler.""" + + def handle(self): + # receive data through TCP socket + chunk = self.request.recv(1024) + target_len = struct.unpack('>i', chunk[:4])[0] + data = chunk[4:] + while len(data) < target_len: + chunk = self.request.recv(1024) + data += chunk + # write to file + filename = self._write_to_file(data) + + print("Received utterance[length=%d] from %s, saved to %s." % + (len(data), self.client_address[0], filename)) + start_time = time.time() + transcript = self.server.audio_process_handler(filename) + finish_time = time.time() + print("Response Time: %f, Transcript: %s" % + (finish_time - start_time, transcript)) + self.request.sendall(transcript.encode('utf-8')) + + def _write_to_file(self, data): + # prepare save dir and filename + if not os.path.exists(self.server.speech_save_dir): + os.mkdir(self.server.speech_save_dir) + timestamp = strftime("%Y%m%d%H%M%S", gmtime()) + out_filename = os.path.join( + self.server.speech_save_dir, + timestamp + "_" + self.client_address[0] + ".wav") + # write to wav file + file = wave.open(out_filename, 'wb') + file.setnchannels(1) + file.setsampwidth(2) + file.setframerate(16000) + file.writeframes(data) + file.close() + return out_filename diff --git a/deepspeech/utils/utility.py b/deepspeech/utils/utility.py new file mode 100644 index 000000000..72a45e29a --- /dev/null +++ b/deepspeech/utils/utility.py @@ -0,0 +1,60 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains common utility functions.""" + +import numpy as np +import distutils.util + +__all__ = ['print_arguments', 'add_arguments'] + + +def print_arguments(args): + """Print argparse's arguments. + + Usage: + + .. code-block:: python + + parser = argparse.ArgumentParser() + parser.add_argument("name", default="Jonh", type=str, help="User name.") + args = parser.parse_args() + print_arguments(args) + + :param args: Input argparse.Namespace for printing. + :type args: argparse.Namespace + """ + print("----------- Configuration Arguments -----------") + for arg, value in sorted(vars(args).items()): + print("%s: %s" % (arg, value)) + print("------------------------------------------------") + + +def add_arguments(argname, type, default, help, argparser, **kwargs): + """Add argparse's argument. + + Usage: + + .. code-block:: python + + parser = argparse.ArgumentParser() + add_argument("name", str, "Jonh", "User name.", parser) + args = parser.parse_args() + """ + type = distutils.util.strtobool if type == bool else type + argparser.add_argument( + "--" + argname, + default=default, + type=type, + help=help + ' Default: %(default)s.', + **kwargs) \ No newline at end of file diff --git a/deploy/demo_server.py b/deploy/demo_server.py deleted file mode 100644 index bfc48c9f1..000000000 --- a/deploy/demo_server.py +++ /dev/null @@ -1,251 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
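The `add_arguments`/`print_arguments` pair above follows the same helper style used by the demo script being removed below. A minimal usage sketch (the argument names and values are illustrative):

```python
# Usage sketch for deepspeech.utils.utility.add_arguments / print_arguments.
import argparse
import functools
from deepspeech.utils.utility import add_arguments, print_arguments

parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
add_arg('host_port', int,  8086, "Server's IP port.")
add_arg('use_gpu',   bool, True, "Use GPU or not.")  # bool goes through strtobool

args = parser.parse_args(['--host_port', '9000', '--use_gpu', 'False'])
print_arguments(args)  # prints the sorted "name: value" table
```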
-"""Server-end for the ASR demo.""" -import os -import time -import random -import argparse -import functools -from time import gmtime, strftime -import SocketServer -import struct -import wave -import paddle.fluid as fluid -import numpy as np -import _init_paths -from data_utils.data import DataGenerator -from model_utils.model import DeepSpeech2Model -from data_utils.utility import read_manifest -from utils.utility import add_arguments, print_arguments - -parser = argparse.ArgumentParser(description=__doc__) -add_arg = functools.partial(add_arguments, argparser=parser) -# yapf: disable -add_arg('host_port', int, 8086, "Server's IP port.") -add_arg('beam_size', int, 500, "Beam search width.") -add_arg('num_conv_layers', int, 2, "# of convolution layers.") -add_arg('num_rnn_layers', int, 3, "# of recurrent layers.") -add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.") -add_arg('alpha', float, 2.5, "Coef of LM for beam search.") -add_arg('beta', float, 0.3, "Coef of WC for beam search.") -add_arg('cutoff_prob', float, 1.0, "Cutoff probability for pruning.") -add_arg('cutoff_top_n', int, 40, "Cutoff number for pruning.") -add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.") -add_arg('use_gpu', bool, True, "Use GPU or not.") -add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " - "bi-directional RNNs. Not for GRU.") -add_arg('host_ip', str, - 'localhost', - "Server's IP address.") -add_arg('speech_save_dir', str, - 'demo_cache', - "Directory to save demo audios.") -add_arg('warmup_manifest', str, - 'data/librispeech/manifest.test-clean', - "Filepath of manifest to warm up.") -add_arg('mean_std_path', str, - 'data/librispeech/mean_std.npz', - "Filepath of normalizer's mean & std.") -add_arg('vocab_path', str, - 'data/librispeech/eng_vocab.txt', - "Filepath of vocabulary.") -add_arg('model_path', str, - './checkpoints/libri/step_final', - "If None, the training starts from scratch, " - "otherwise, it resumes from the pre-trained model.") -add_arg('lang_model_path', str, - 'lm/data/common_crawl_00.prune01111.trie.klm', - "Filepath for language model.") -add_arg('decoding_method', str, - 'ctc_beam_search', - "Decoding method. Options: ctc_beam_search, ctc_greedy", - choices = ['ctc_beam_search', 'ctc_greedy']) -add_arg('specgram_type', str, - 'linear', - "Audio feature type. Options: linear, mfcc.", - choices=['linear', 'mfcc']) -# yapf: disable -args = parser.parse_args() - - -class AsrTCPServer(SocketServer.TCPServer): - """The ASR TCP Server.""" - - def __init__(self, - server_address, - RequestHandlerClass, - speech_save_dir, - audio_process_handler, - bind_and_activate=True): - self.speech_save_dir = speech_save_dir - self.audio_process_handler = audio_process_handler - SocketServer.TCPServer.__init__( - self, server_address, RequestHandlerClass, bind_and_activate=True) - - -class AsrRequestHandler(SocketServer.BaseRequestHandler): - """The ASR request handler.""" - - def handle(self): - # receive data through TCP socket - chunk = self.request.recv(1024) - target_len = struct.unpack('>i', chunk[:4])[0] - data = chunk[4:] - while len(data) < target_len: - chunk = self.request.recv(1024) - data += chunk - # write to file - filename = self._write_to_file(data) - - print("Received utterance[length=%d] from %s, saved to %s." 
% - (len(data), self.client_address[0], filename)) - start_time = time.time() - transcript = self.server.audio_process_handler(filename) - finish_time = time.time() - print("Response Time: %f, Transcript: %s" % - (finish_time - start_time, transcript)) - self.request.sendall(transcript.encode('utf-8')) - - def _write_to_file(self, data): - # prepare save dir and filename - if not os.path.exists(self.server.speech_save_dir): - os.mkdir(self.server.speech_save_dir) - timestamp = strftime("%Y%m%d%H%M%S", gmtime()) - out_filename = os.path.join( - self.server.speech_save_dir, - timestamp + "_" + self.client_address[0] + ".wav") - # write to wav file - file = wave.open(out_filename, 'wb') - file.setnchannels(1) - file.setsampwidth(4) - file.setframerate(16000) - file.writeframes(data) - file.close() - return out_filename - - -def warm_up_test(audio_process_handler, - manifest_path, - num_test_cases, - random_seed=0): - """Warming-up test.""" - manifest = read_manifest(manifest_path) - rng = random.Random(random_seed) - samples = rng.sample(manifest, num_test_cases) - for idx, sample in enumerate(samples): - print("Warm-up Test Case %d: %s", idx, sample['audio_filepath']) - start_time = time.time() - transcript = audio_process_handler(sample['audio_filepath']) - finish_time = time.time() - print("Response Time: %f, Transcript: %s" % - (finish_time - start_time, transcript)) - - -def start_server(): - """Start the ASR server""" - # prepare data generator - if args.use_gpu: - place = fluid.CUDAPlace(0) - else: - place = fluid.CPUPlace() - - data_generator = DataGenerator( - vocab_filepath=args.vocab_path, - mean_std_filepath=args.mean_std_path, - augmentation_config='{}', - specgram_type=args.specgram_type, - keep_transcription_text=True, - place = place, - is_training = False) - # prepare ASR model - ds2_model = DeepSpeech2Model( - vocab_size=data_generator.vocab_size, - num_conv_layers=args.num_conv_layers, - num_rnn_layers=args.num_rnn_layers, - rnn_layer_size=args.rnn_layer_size, - use_gru=args.use_gru, - init_from_pretrained_model=args.model_path, - place=place, - share_rnn_weights=args.share_rnn_weights) - - vocab_list = [chars for chars in data_generator.vocab_list] - - if args.decoding_method == "ctc_beam_search": - ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path, - vocab_list) - # prepare ASR inference handler - def file_to_transcript(filename): - feature = data_generator.process_utterance(filename, "") - audio_len = feature[0].shape[1] - mask_shape0 = (feature[0].shape[0] - 1) // 2 + 1 - mask_shape1 = (feature[0].shape[1] - 1) // 3 + 1 - mask_max_len = (audio_len - 1) // 3 + 1 - mask_ones = np.ones((mask_shape0, mask_shape1)) - mask_zeros = np.zeros((mask_shape0, mask_max_len - mask_shape1)) - mask = np.repeat( - np.reshape( - np.concatenate((mask_ones, mask_zeros), axis=1), - (1, mask_shape0, mask_max_len)), - 32, - axis=0) - feature = (np.array([feature[0]]).astype('float32'), - None, - np.array([audio_len]).astype('int64').reshape([-1,1]), - np.array([mask]).astype('float32')) - probs_split = ds2_model.infer_batch_probs( - infer_data=feature, - feeding_dict=data_generator.feeding) - - if args.decoding_method == "ctc_greedy": - result_transcript = ds2_model.decode_batch_greedy( - probs_split=probs_split, - vocab_list=vocab_list) - else: - result_transcript = ds2_model.decode_batch_beam_search( - probs_split=probs_split, - beam_alpha=args.alpha, - beam_beta=args.beta, - beam_size=args.beam_size, - cutoff_prob=args.cutoff_prob, - cutoff_top_n=args.cutoff_top_n, - 
vocab_list=vocab_list, - num_processes=1) - return result_transcript[0] - - # warming up with utterrances sampled from Librispeech - print('-----------------------------------------------------------') - print('Warming up ...') - warm_up_test( - audio_process_handler=file_to_transcript, - manifest_path=args.warmup_manifest, - num_test_cases=3) - print('-----------------------------------------------------------') - - # start the server - server = AsrTCPServer( - server_address=(args.host_ip, args.host_port), - RequestHandlerClass=AsrRequestHandler, - speech_save_dir=args.speech_save_dir, - audio_process_handler=file_to_transcript) - print("ASR Server Started.") - server.serve_forever() - - -def main(): - print_arguments(args) - start_server() - - -if __name__ == "__main__": - main() diff --git a/docs/README_cn_old.md b/docs/README_cn_old.md new file mode 100644 index 000000000..375d588dc --- /dev/null +++ b/docs/README_cn_old.md @@ -0,0 +1,508 @@ +# 语音识别: DeepSpeech2 + +[English](README.md) + +*DeepSpeech2*是一个采用[PaddlePaddle](https://github.com/PaddlePaddle/Paddle)平台的端到端自动语音识别(ASR)引擎的开源项目,具体原理参考这篇论文[Baidu's Deep Speech 2 paper](http://proceedings.mlr.press/v48/amodei16.pdf)。 +我们的愿景是为语音识别在工业应用和学术研究上,提供易于使用、高效和可扩展的工具,包括训练,推理,测试模块,以及 demo 部署。同时,我们还将发布一些预训练好的英语和普通话模型。 + +## 目录 +- [安装](#安装) +- [在 Docker 容器上运行](#在Docker容器上运行) +- [开始](#开始) +- [数据准备](#数据准备) +- [训练模型](#训练模型) +- [数据增强流水线](#数据增强流水线) +- [推断和评价](#推断和评价) +- [超参数调整](#超参数调整) +- [训练汉语语言](#训练汉语语言) +- [用自己的声音尝试现场演示](#用自己的声音尝试现场演示) +- [发布模型](#发布模型) +- [试验和基准](#试验和基准) +- [问题和帮助](#问题和帮助) + +## 安装 +为了避免环境配置问题,强烈建议在[Docker容器上运行](#在Docker容器上运行),否则请按照下面的指南安装依赖项。 + +### 前提 +- Python >= 3.7 +- PaddlePaddle 1.8.5 版本及以上(请参考[安装指南](https://www.paddlepaddle.org.cn/install/quick)) + +### 安装 +- 请确保以下库或工具已安装完毕:`pkg-config`, `flac`, `ogg`, `vorbis`, `boost` 和 `swig`, 如可以通过`apt-get`安装: + +```bash +sudo apt-get install -y pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev +``` + +或者,也可以通过`yum`安装: + +```bash +sudo yum install pkgconfig libogg-devel libvorbis-devel boost-devel python3-devel +wget https://ftp.osuosl.org/pub/xiph/releases/flac/flac-1.3.1.tar.xz +xz -d flac-1.3.1.tar.xz +tar -xvf flac-1.3.1.tar +cd flac-1.3.1 +./configure +make +make install +``` + +- 运行脚本安装其余的依赖项 + +```bash +git clone https://github.com/PaddlePaddle/DeepSpeech.git +cd DeepSpeech +pushd tools; make; popd +source tools/venv/bin/activate +sh setup.sh +``` + +- Source venv before do experiment. 
+ +```bash +source tools/venv/bin/activate +``` + + +### 在Docker容器上运行 + +Docker 是一个开源工具,用于在孤立的环境中构建、发布和运行分布式应用程序。此项目的 Docker 镜像已在[hub.docker.com](https://hub.docker.com)中提供,并安装了所有依赖项,其中包括预先构建的PaddlePaddle,CTC解码器以及其他必要的 Python 和第三方库。这个 Docker 映像需要NVIDIA GPU的支持,所以请确保它的可用性并已完成[nvidia-docker](https://github.com/NVIDIA/nvidia-docker)的安装。 + +采取以下步骤来启动 Docker 镜像: + +- 下载 Docker 镜像 + +```bash +nvidia-docker pull hub.baidubce.com/paddlepaddle/deep_speech_fluid:latest-gpu +``` + +- git clone 这个资源库 + +``` +git clone https://github.com/PaddlePaddle/DeepSpeech.git +``` + +- 运行 Docker 镜像 + +```bash +sudo nvidia-docker run -it -v $(pwd)/DeepSpeech:/DeepSpeech hub.baidubce.com/paddlepaddle/deep_speech_fluid:latest-gpu /bin/bash +``` + +现在返回并从[开始](#开始)部分开始,您可以在Docker容器中同样执行模型训练,推断和超参数调整。 + +- 安装 PaddlePaddle + +例如 CUDA 10.1, CuDNN7.5: +```bash +python3 -m pip install paddlepaddle-gpu==1.8.5.post107 +``` + +## 开始 + +`./examples`里的一些 shell 脚本将帮助我们在一些公开数据集(比如:[LibriSpeech](http://www.openslr.org/12/), [Aishell](http://www.openslr.org/33)) 进行快速尝试,包括了数据准备,模型训练,案例推断和模型评价。阅读这些例子将帮助你理解如何使用你的数据集训练模型。 + +`./examples`目录中的一些脚本配置使用了 8 个 GPU。如果你没有 8 个可用的 GPU,请修改环境变量`CUDA_VISIBLE_DEVICES`。如果你没有可用的 GPU,请设置`--use_gpu`为 False,这样程序会用 CPU 代替 GPU。另外如果发生内存不足的问题,减小`--batch_size`即可。 + +让我们先看看[LibriSpeech dataset](http://www.openslr.org/12/)小样本集的例子。 + +- 进入目录 + + ```bash + cd examples/tiny + ``` + + 注意这仅仅是 LibriSpeech 一个小数据集的例子。如果你想尝试完整的数据集(可能需要花好几天来训练模型),请使用这个路径`examples/librispeech`。 +- 准备数据 + + ```bash + sh run_data.sh + ``` + + 运行`run_data.sh`脚本将会下载数据集,产出 manifests 文件,收集一些归一化需要的统计信息并建立词表。当数据准备完成之后,下载完的数据(仅有 LibriSpeech 一部分)在`dataset/librispeech`中;其对应的 manifest 文件,均值标准差和词表文件在`./data/tiny`中。在第一次执行的时候一定要执行这个脚本,在接下来所有的实验中我们都会用到这个数据集。 +- 训练你自己的 ASR 模型 + + ```bash + sh run_train.sh + ``` + + `run_train.sh`将会启动训练任务,训练日志会打印到终端,并且模型每个 epoch 的 checkpoint 都会保存到`./checkpoints/tiny`目录中。这些 checkpoint 可以用来恢复训练,推断,评价和部署。 +- 用已有的模型进行案例推断 + + ```bash + sh run_infer.sh + ``` + + `run_infer.sh`将会利用训练好的模型展现一些(默认 10 个)样本语音到文本的解码结果。由于当前模型只使用了 LibriSpeech 一部分数据集训练,因此性能可能不会太好。为了看到更好模型上的表现,你可以下载一个已训练好的模型(用完整的 LibriSpeech 训练了好几天)来做推断。 + + ```bash + sh run_infer_golden.sh + ``` +- 评价一个已经存在的模型 + + ```bash + sh run_test.sh + ``` + + `run_test.sh`能够利用误字率(或字符错误率)来评价模型。类似的,你可以下载一个完全训练好的模型来测试它的性能: + + ```bash + sh run_test_golden.sh + ``` + +更多细节会在接下来的章节中阐述。祝你在*DeepSpeech2*ASR引擎学习中过得愉快! 
+ + +## 数据准备 + +### 生成Manifest + +*DeepSpeech2*接受文本**manifest**文件作为数据接口。manifest 文件包含了一系列语音数据,其中每一行代表一个[JSON](http://www.json.org/)格式的音频元数据(比如文件路径,描述,时长)。具体格式如下: + +``` +{"audio_filepath": "/home/work/.cache/paddle/Libri/134686/1089-134686-0001.flac", "duration": 3.275, "text": "stuff it into you his belly counselled him"} +{"audio_filepath": "/home/work/.cache/paddle/Libri/134686/1089-134686-0007.flac", "duration": 4.275, "text": "a cold lucid indifference reigned in his soul"} +``` + +如果你要使用自定义数据,你只需要按照以上格式生成自己的 manifest 文件即可。给定 manifest 文件,训练、推断以及其它所有模块都能够访问到音频数据以及对应的时长和标签数据。 + +关于如何生成 manifest 文件,请参考`data/librispeech/librispeech.py`。该脚本将会下载 LibriSpeech 数据集并生成 manifest 文件。 + +### 计算均值和标准差用于归一化 + +为了对音频特征进行 z-score 归一化(零均值,单位标准差),我们必须预估训练样本特征的均值和标准差: + +```bash +python3 tools/compute_mean_std.py \ +--num_samples 2000 \ +--specgram_type linear \ +--manifest_path data/librispeech/manifest.train \ +--output_path data/librispeech/mean_std.npz +``` + +以上这段代码会计算在`data/librispeech/manifest.train`路径中,2000 个随机采样的语音频谱特征的均值和标准差,并将结果保存在`data/librispeech/mean_std.npz`中,方便以后使用。 + +### 建立词表 + +我们需要一个包含可能会出现的字符集合的词表来在训练的时候将字符转换成索引,并在解码的时候将索引转换回文本。`tools/build_vocab.py`脚本将生成这种基于字符的词表。 + +```bash +python3 tools/build_vocab.py \ +--count_threshold 0 \ +--vocab_path data/librispeech/eng_vocab.txt \ +--manifest_paths data/librispeech/manifest.train +``` + +它将`data/librispeech/manifest.train`目录中的所有录音文本写入词表文件`data/librispeeech/eng_vocab.txt`,并且没有词汇截断(`--count_threshold 0`)。 + +### 更多帮助 + +获得更多帮助: + +```bash +python3 data/librispeech/librispeech.py --help +python3 tools/compute_mean_std.py --help +python3 tools/build_vocab.py --help +``` + +## 训练模型 + +`train.py`是训练模块的主要调用者。使用示例如下。 + +- 开始使用 8 片 GPU 训练: + + ``` + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 train.py + ``` + +- 开始使用 CPU 训练: + + ``` + python3 train.py --use_gpu False + ``` + +- 从检查点恢复训练: + + ``` + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ + python3 train.py \ + --init_from_pretrained_model CHECKPOINT_PATH_TO_RESUME_FROM + ``` + +获得更多帮助: + +```bash +python3 train.py --help +``` +或参考 `example/librispeech/run_train.sh`. + + +## 数据增强流水线 + +数据增强是用来提升深度学习性能的非常有效的技术。我们通过在原始音频中添加小的随机扰动(标签不变转换)获得新音频来增强我们的语音数据。你不必自己合成,因为数据增强已经嵌入到数据生成器中并且能够即时完成,在训练模型的每个epoch中随机合成音频。 + +目前提供六个可选的增强组件供选择,配置并插入处理过程。 + + - 音量扰动 + - 速度扰动 + - 移动扰动 + - 在线贝叶斯归一化 + - 噪声干扰(需要背景噪音的音频文件) + - 脉冲响应(需要脉冲音频文件) + +为了让训练模块知道需要哪些增强组件以及它们的处理顺序,我们需要事先准备一个[JSON](http://www.json.org/)格式的*扩展配置文件*。例如: + +``` +[{ + "type": "speed", + "params": {"min_speed_rate": 0.95, + "max_speed_rate": 1.05}, + "prob": 0.6 +}, +{ + "type": "shift", + "params": {"min_shift_ms": -5, + "max_shift_ms": 5}, + "prob": 0.8 +}] +``` + +当`trainer.py`的`--augment_conf_file`参数被设置为上述示例配置文件的路径时,每个 epoch 中的每个音频片段都将被处理。首先,均匀随机采样速率会有60%的概率在 0.95 和 1.05 之间对音频片段进行速度扰动。然后,音频片段有 80% 的概率在时间上被挪移,挪移偏差值是 -5 毫秒和 5 毫秒之间的随机采样。最后,这个新合成的音频片段将被传送给特征提取器,以用于接下来的训练。 + +有关其他配置实例,请参考`conf/augmenatation.config.example`. 
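The paragraph above describes how each augmentor in the JSON list is applied with its own probability and a uniformly sampled parameter. The sketch below is a simplified illustration of that behaviour only; it is not the repository's actual `AugmentationPipeline` implementation.

```python
# Simplified illustration of how an augmentation config is interpreted;
# NOT the real AugmentationPipeline, just the idea behind "prob" and "params".
import json
import random

config = json.loads("""
[{"type": "speed", "params": {"min_speed_rate": 0.95, "max_speed_rate": 1.05}, "prob": 0.6},
 {"type": "shift", "params": {"min_shift_ms": -5, "max_shift_ms": 5}, "prob": 0.8}]
""")

rng = random.Random(0)

def augment(audio):
    for item in config:
        if rng.uniform(0., 1.) < item["prob"]:       # e.g. 60% chance for "speed"
            lo, hi = sorted(item["params"].values())
            setting = rng.uniform(lo, hi)            # uniformly sampled parameter
            print("would apply", item["type"], "with", round(setting, 3))
            # the real pipeline calls the matching augmentor on `audio` here
    return audio

augment(audio=[0.0] * 16000)
```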
+ +使用数据增强技术时要小心,由于扩大了训练和测试集的差异,不恰当的增强会对训练模型不利,导致训练和预测的差距增大。 + +## 推断和评价 + +### 准备语言模型 + +提升解码器的性能需要准备语言模型。我们准备了两种语言模型(有损压缩)供用户下载和尝试。一个是英语模型,另一个是普通话模型。用户可以执行以下命令来下载已经训练好的语言模型: + +```bash +cd models/lm +bash download_lm_en.sh +bash download_lm_ch.sh +``` + +如果你想训练自己更好的语言模型,请参考[KenLM](https://github.com/kpu/kenlm)获取教程。在这里,我们提供一些技巧来展示我们如何准备我们的英语和普通话模型。当你训练自己的模型的时候,可以参考这些技巧。 + + +#### 英语语言模型 + +英语语料库来自[Common Crawl Repository](http://commoncrawl.org),你可以从[statmt](http://data.statmt.org/ngrams/deduped_en)下载它。我们使用en.00部分来训练我们的英语语言模型。训练前有如下的一些预处理过程: + + * 不在\['A-Za-z0-9\s'\](\s表示空白字符)中的字符将被删除,阿拉伯数字被转换为英文数字,比如“1000”转换为 one thousand。 + * 重复的空白字符被压缩为一个,并且开始的空白字符将被删除。请注意,所有的录音都是小写字母,因此所有字符都转换为小写字母。 + * 选择前 40 万个最常用的单词来建立词表,其余部分将被替换为“UNKNOWNWORD”。 + +现在预处理完成了,我们得到一个干净的语料库来训练语言模型。我们发布的语言模型版本使用了参数“-o 5 --prune 0 1 1 1 1”来训练。“-o 5”表示语言模型的最大order为 5。“--prune 0 1 1 1 1”表示每个 order 的计数阈值,更具体地说,它将第 2 个以及更高的 order 修剪为单个。为了节省磁盘存储空间,我们将使用参数“-a 22 -q 8 -b 8”将 arpa 文件转换为“trie”二进制文件。“-a”表示在“trie”中用于切分的指针的最高位数。“-q -b”是概率和退避的量化参数。 + +#### 普通话语言模型 + +与英语语言模型不同的是,普通话语言模型是基于字符的,其中每一位都是中文汉字。我们使用内部语料库来训练发布的汉语语言模型。该语料库包含数十亿汉字。预处理阶段与英语语言模型有一些小的差别,主要步骤包括: + + * 删除开始和结尾的空白字符。 + * 删除英文标点和中文标点。 + * 在两个字符之间插入空白字符。 + +请注意,发布的语言模型只包含中文简体字。预处理完成后,我们开始训练语言模型。这个小的语言模型训练关键参数是“-o 5 --prune 0 1 2 4 4”,“-o 5”是针对大语言模型。请参考上面的部分了解每个参数的含义。我们还使用默认设置将 arpa 文件转换为二进制文件。 + +### 语音到文本推断 + +推断模块使用`infer.py`进行调用,可以用来推断,解码,以及输出一些给定音频片段可视化到文本的结果。这有助于对ASR模型的性能进行直观和定性的评估。 + +- GPU 版本的推断: + + ```bash + CUDA_VISIBLE_DEVICES=0 python3 infer.py + ``` + +- CPU 版本的推断: + + ```bash + python3 infer.py --use_gpu False + ``` + +我们提供两种类型的 CTC 解码器:*CTC贪心解码器*和*CTC波束搜索解码器*。*CTC贪心解码器*是简单的最佳路径解码算法的实现,在每个时间步选择最可能的字符,因此是贪心的并且是局部最优的。[*CTC波束搜索解码器*](https://arxiv.org/abs/1408.2873)另外使用了启发式广度优先图搜索以达到近似全局最优; 它也需要预先训练的KenLM语言模型以获得更好的评分和排名。解码器类型可以用参数`--decoding_method`设置。 + +获得更多帮助: + +``` +python3 infer.py --help +``` +或参考`example/librispeech/run_infer.sh`. + +### 评估模型 + +要定量评估模型的性能,请运行: + +- GPU 版本评估 + + ```bash + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 test.py + ``` + +- CPU 版本评估 + + ```bash + python3 test.py --use_gpu False + ``` + +错误率(默认:误字率;可以用--error_rate_type设置)将被打印出来。 + +获得更多帮助: + +```bash +python3 test.py --help +``` +或参考`example/librispeech/run_test.sh`. + +## 超参数调整 + +[*CTC波束搜索解码器*](https://arxiv.org/abs/1408.2873)的超参数$\alpha$(语言模型权重)和$\beta$(单词插入权重)对解码器的性能有非常显著的影响。当声学模型更新时,最好在验证集上重新调整它们。 + +`tools/tune.py`会进行2维网格查找超参数$\alpha$和$\beta$。你必须提供$\alpha$和$\beta$的范围,以及尝试的次数。 + +- GPU 版的调整: + + ```bash + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ + python3 tools/tune.py \ + --alpha_from 1.0 \ + --alpha_to 3.2 \ + --num_alphas 45 \ + --beta_from 0.1 \ + --beta_to 0.45 \ + --num_betas 8 + ``` + +- CPU 版的调整: + + ```bash + python3 tools/tune.py --use_gpu False + ``` +网格搜索将会在超参数空间的每个点处打印出 WER (误字率)或者 CER (字符错误率),并且可绘出误差曲面。一个合适的超参数范围应包括 WER/CER 误差表面的全局最小值,如下图所示。 + +

+（图：调整LibriSpeech的dev-clean集合的误差曲面示例）
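As a rough illustration of the 2-D grid search that `tools/tune.py` performs over $\alpha$ and $\beta$, consider the sketch below; `evaluate_wer` is a hypothetical placeholder for decoding the validation batches at a given operating point, and the ranges mirror the command shown earlier.

```python
# Conceptual sketch of the 2-D grid search over (alpha, beta); evaluate_wer is
# a placeholder and does not exist in the repository.
import numpy as np

def evaluate_wer(alpha, beta):
    # placeholder: decode the validation set with this (alpha, beta)
    # and return the measured word error rate
    return (alpha - 2.5) ** 2 + (beta - 0.3) ** 2

alphas = np.linspace(1.0, 3.2, num=45)
betas = np.linspace(0.1, 0.45, num=8)
results = [(a, b, evaluate_wer(a, b)) for a in alphas for b in betas]
best_alpha, best_beta, best_wer = min(results, key=lambda t: t[2])
print(f"best alpha={best_alpha:.2f}, beta={best_beta:.3f}, WER={best_wer:.3f}")
```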

+ +通常,如图所示,语言模型权重($\alpha$)的变化显著影响 CTC波束搜索解码器的性能。更好的方法是首先调整多批数据(可指定数量)以找出适当的超参数范围,然后更改为完整的验证集以进行精确调整。 + +调整之后,您可以在推理和评价模块中重置$\alpha$和$\beta$,以检查它们是否真的有助于提高 ASR 性能。更多帮助如下: + +```bash +python3 tune.py --help +``` +或参考`example/librispeech/run_tune.sh`. + +## 训练普通话语言 + +普通话语言训练与英语训练的关键步骤相同,我们提供了一个使用 Aishell 进行普通话训练的例子```examples/aishell```。如上所述,请执行```sh run_data.sh```, ```sh run_train.sh```, ```sh run_test.sh```和```sh run_infer.sh```做相应的数据准备,训练,测试和推断。我们还准备了一个预训练过的模型(执行./models/aishell/download_model.sh下载)供用户使用```run_infer_golden.sh```和```run_test_golden.sh```来。请注意,与英语语言模型不同,普通话语言模型是基于汉字的,请运行```tools/tune.py```来查找最佳设置。 + +## 用自己的声音尝试现场演示 + +到目前为止,一个 ASR 模型已经训练完毕,并且用现有的音频文件进行了定性测试(`infer.py`)和定量测试(`test.py`)。但目前还没有用你自己的声音进行测试。`deploy/demo_english_server.py`和`deploy/demo_client.py`能够快速构建一个利用已训练好的模型对ASR引擎进行实时演示的系统,使你能够用自己的语音测试和演示。 + +要启动演示服务,请在控制台中运行: + +```bash +CUDA_VISIBLE_DEVICES=0 \ +python3 deploy/demo_server.py \ +--host_ip localhost \ +--host_port 8086 +``` + +对于运行 demo 客户端的机器(可能不是同一台机器),请在继续之前执行以下安装。 + +比如,对于 MAC OS X 机器: + +```bash +brew install portaudio +pip install pyaudio +pip install keyboard +``` + +然后启动客户端,请在另一个控制台中运行: + +```bash +CUDA_VISIBLE_DEVICES=0 \ +python3 -u deploy/demo_client.py \ +--host_ip 'localhost' \ +--host_port 8086 +``` + +现在,在客户端控制台中,按下`空格`键,按住并开始讲话。讲话完毕请释放该键以让控制台中显示语音的文本结果。要退出客户端,只需按`ESC`键。 + +请注意,`deploy/demo_client.py`必须在带麦克风设备的机器上运行,而`deploy/demo_server.py`可以在没有任何录音硬件的情况下运行,例如任何远程服务器机器。如果服务器和客户端使用两台独立的机器运行,只需要注意将`host_ip`和`host_port`参数设置为实际可访问的IP地址和端口。如果它们在单台机器上运行,则不用作任何处理。 + +请参考`examples/deploy_demo/run_english_demo_server.sh`,它将首先下载一个预先训练过的英语模型(用3000小时的内部语音数据训练),然后用模型启动演示服务器。通过运行`examples/mandarin/run_demo_client.sh`,你可以说英语来测试它。如果您想尝试其他模型,只需更新脚本中的`--model_path`参数即可。 + +获得更多帮助: + +```bash +python3 deploy/demo_server.py --help +python3 deploy/demo_client.py --help +``` + +## 发布模型 + +#### 语音模型发布 + +语种 | 模型名 | 训练数据 | 语音时长 +:-----------: | :------------: | :----------: | -------: +English | [LibriSpeech Model](https://deepspeech.bj.bcebos.com/eng_models/librispeech_model_fluid.tar.gz) | [LibriSpeech Dataset](http://www.openslr.org/12/) | 960 h +English | [BaiduEN8k Model](https://deepspeech.bj.bcebos.com/demo_models/baidu_en8k_model_fluid.tar.gz) | Baidu Internal English Dataset | 8628 h +Mandarin | [Aishell Model](https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_fluid.tar.gz) | [Aishell Dataset](http://www.openslr.org/33/) | 151 h +Mandarin | [BaiduCN1.2k Model](https://deepspeech.bj.bcebos.com/demo_models/baidu_cn1.2k_model_fluid.tar.gz) | Baidu Internal Mandarin Dataset | 1204 h + +#### 语言模型发布 + +语言模型 | 训练数据 | 基于的字符 | 大小 | 描述 +:-------------:| :------------:| :-----: | -----: | :----------------- +[English LM](https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm) | [CommonCrawl(en.00)](http://web-language-models.s3-website-us-east-1.amazonaws.com/ngrams/en/deduped/en.00.deduped.xz) | Word-based | 8.3 GB | Pruned with 0 1 1 1 1;
About 1.85 billion n-grams;
'trie' binary with '-a 22 -q 8 -b 8' +[Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4;
About 0.13 billion n-grams;
'probing' binary with default settings +[Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning;
About 3.7 billion n-grams;
'probing' binary with default settings + +## 实验和baseline + +#### 英语模型的baseline测试结果(字错误率) + +测试集 | LibriSpeech Model | BaiduEN8K Model +:--------------------- | ---------------: | -------------------: +LibriSpeech Test-Clean | 6.85 | 5.41 +LibriSpeech Test-Other | 21.18 | 13.85 +VoxForge American-Canadian | 12.12 | 7.13 +VoxForge Commonwealth | 19.82 | 14.93 +VoxForge European | 30.15 | 18.64 +VoxForge Indian | 53.73 | 25.51 +Baidu Internal Testset | 40.75 | 8.48 + +为了在VoxForge数据上重现基准测试结果,我们提供了一个脚本来下载数据并生成VoxForge方言manifest文件。请到```data/voxforge```执行````run_data.sh```来获取VoxForge方言manifest文件。请注意,VoxForge数据可能会持续更新,生成的清单文件可能与我们评估的清单文件有所不同。 + + +#### 普通话模型的baseline测试结果(字符错误率) + +测试集 | BaiduCN1.2k Model +:--------------------- | -------------------: +Baidu Internal Testset | 12.64 + +#### 多GPU加速 + +我们对1,2,4,8个Tesla V100 GPU的训练时间(LibriSpeech样本的子集,其音频持续时间介于6.0和7.0秒之间)进行比较。它表明,已经实现了具有多个GPU的**近线性**加速。在下图中,训练的时间(以秒为单位)显示在蓝色条上。 + +
+ +| # of GPU | 加速比 | +| -------- | --------------: | +| 1 | 1.00 X | +| 2 | 1.98 X | +| 4 | 3.73 X | +| 8 | 6.95 X | + +`tools/profile.sh`提供了上述分析工具. + +## 问题和帮助 + +欢迎您在[Github问题](https://github.com/PaddlePaddle/models/issues)中提交问题和bug。也欢迎您为这个项目做出贡献。 diff --git a/docs/README_old.md b/docs/README_old.md new file mode 100644 index 000000000..6f21a178a --- /dev/null +++ b/docs/README_old.md @@ -0,0 +1,507 @@ +# DeepSpeech2 on PaddlePaddle + +[中文版](README_cn.md) + +*DeepSpeech2 on PaddlePaddle* is an open-source implementation of end-to-end Automatic Speech Recognition (ASR) engine, based on [Baidu's Deep Speech 2 paper](http://proceedings.mlr.press/v48/amodei16.pdf), with [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) platform. Our vision is to empower both industrial application and academic research on speech recognition, via an easy-to-use, efficient and scalable implementation, including training, inference & testing module, and demo deployment. Besides, several pre-trained models for both English and Mandarin are also released. + +## Table of Contents +- [Installation](#installation) +- [Running in Docker Container](#running-in-docker-container) +- [Getting Started](#getting-started) +- [Data Preparation](#data-preparation) +- [Training a Model](#training-a-model) +- [Data Augmentation Pipeline](#data-augmentation-pipeline) +- [Inference and Evaluation](#inference-and-evaluation) +- [Hyper-parameters Tuning](#hyper-parameters-tuning) +- [Training for Mandarin Language](#training-for-mandarin-language) +- [Trying Live Demo with Your Own Voice](#trying-live-demo-with-your-own-voice) +- [Released Models](#released-models) +- [Experiments and Benchmarks](#experiments-and-benchmarks) +- [Questions and Help](#questions-and-help) + + + +## Installation + +To avoid the trouble of environment setup, [running in Docker container](#running-in-docker-container) is highly recommended. Otherwise follow the guidelines below to install the dependencies manually. + +### Prerequisites +- Python >= 3.7 +- PaddlePaddle 1.8.5 (please refer to the [Installation Guide](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/index_en.html)) + +### Setup +- Make sure these libraries or tools installed: `pkg-config`, `flac`, `ogg`, `vorbis`, `boost` and `swig`, e.g. installing them via `apt-get`: + +```bash +sudo apt-get install -y pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev +``` + +or, installing them via `yum`: + +```bash +sudo yum install pkgconfig libogg-devel libvorbis-devel boost-devel python3-devel +wget https://ftp.osuosl.org/pub/xiph/releases/flac/flac-1.3.1.tar.xz +xz -d flac-1.3.1.tar.xz +tar -xvf flac-1.3.1.tar +cd flac-1.3.1 +./configure +make +make install +``` + +- Run the setup script for the remaining dependencies + +```bash +git clone https://github.com/PaddlePaddle/DeepSpeech.git +cd DeepSpeech +pushd tools; make; popd +source tools/venv/bin/activate +bash setup.sh +``` + +- Source venv before do experiment. + +```bash +source tools/venv/bin/activate +``` + +### Running in Docker Container + +Docker is an open source tool to build, ship, and run distributed applications in an isolated environment. A Docker image for this project has been provided in [hub.docker.com](https://hub.docker.com) with all the dependencies installed, including the pre-built PaddlePaddle, CTC decoders, and other necessary Python and third-party packages. 
This Docker image requires the support of NVIDIA GPU, so please make sure its availiability and the [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) has been installed. + +Take several steps to launch the Docker image: + +- Download the Docker image + +```bash +nvidia-docker pull hub.baidubce.com/paddlepaddle/deep_speech_fluid:latest-gpu +``` + +- Clone this repository + +``` +git clone https://github.com/PaddlePaddle/DeepSpeech.git +``` + +- Run the Docker image + +```bash +sudo nvidia-docker run -it -v $(pwd)/DeepSpeech:/DeepSpeech hub.baidubce.com/paddlepaddle/deep_speech_fluid:latest-gpu /bin/bash +``` +Now go back and start from the [Getting Started](#getting-started) section, you can execute training, inference and hyper-parameters tuning similarly in the Docker container. + + +- Install PaddlePaddle + +For example, for CUDA 10.1, CuDNN7.5: +```bash +python3 -m pip install paddlepaddle-gpu==1.8.0.post107 +``` + +## Getting Started + +Several shell scripts provided in `./examples` will help us to quickly give it a try, for most major modules, including data preparation, model training, case inference and model evaluation, with a few public dataset (e.g. [LibriSpeech](http://www.openslr.org/12/), [Aishell](http://www.openslr.org/33)). Reading these examples will also help you to understand how to make it work with your own data. + +Some of the scripts in `./examples` are configured with 8 GPUs. If you don't have 8 GPUs available, please modify `CUDA_VISIBLE_DEVICES`. If you don't have any GPU available, please set `--use_gpu` to False to use CPUs instead. Besides, if out-of-memory problem occurs, just reduce `--batch_size` to fit. + +Let's take a tiny sampled subset of [LibriSpeech dataset](http://www.openslr.org/12/) for instance. + +- Go to directory + + ```bash + cd examples/tiny + ``` + + Notice that this is only a toy example with a tiny sampled subset of LibriSpeech. If you would like to try with the complete dataset (would take several days for training), please go to `examples/librispeech` instead. +- Prepare the data + + ```bash + sh run_data.sh + ``` + + `run_data.sh` will download dataset, generate manifests, collect normalizer's statistics and build vocabulary. Once the data preparation is done, you will find the data (only part of LibriSpeech) downloaded in `./dataset/librispeech` and the corresponding manifest files generated in `./data/tiny` as well as a mean stddev file and a vocabulary file. It has to be run for the very first time you run this dataset and is reusable for all further experiments. +- Train your own ASR model + + ```bash + sh run_train.sh + ``` + + `run_train.sh` will start a training job, with training logs printed to stdout and model checkpoint of every pass/epoch saved to `./checkpoints/tiny`. These checkpoints could be used for training resuming, inference, evaluation and deployment. +- Case inference with an existing model + + ```bash + sh run_infer.sh + ``` + + `run_infer.sh` will show us some speech-to-text decoding results for several (default: 10) samples with the trained model. The performance might not be good now as the current model is only trained with a toy subset of LibriSpeech. 
To see the results with a better model, you can download a well-trained (trained for several days, with the complete LibriSpeech) model and do the inference: + + ```bash + sh run_infer_golden.sh + ``` +- Evaluate an existing model + + ```bash + sh run_test.sh + ``` + + `run_test.sh` will evaluate the model with Word Error Rate (or Character Error Rate) measurement. Similarly, you can also download a well-trained model and test its performance: + + ```bash + sh run_test_golden.sh + ``` + +More detailed information are provided in the following sections. Wish you a happy journey with the *DeepSpeech2 on PaddlePaddle* ASR engine! + + +## Data Preparation + +### Generate Manifest + +*DeepSpeech2 on PaddlePaddle* accepts a textual **manifest** file as its data set interface. A manifest file summarizes a set of speech data, with each line containing some meta data (e.g. filepath, transcription, duration) of one audio clip, in [JSON](http://www.json.org/) format, such as: + +``` +{"audio_filepath": "/home/work/.cache/paddle/Libri/134686/1089-134686-0001.flac", "duration": 3.275, "text": "stuff it into you his belly counselled him"} +{"audio_filepath": "/home/work/.cache/paddle/Libri/134686/1089-134686-0007.flac", "duration": 4.275, "text": "a cold lucid indifference reigned in his soul"} +``` + +To use your custom data, you only need to generate such manifest files to summarize the dataset. Given such summarized manifests, training, inference and all other modules can be aware of where to access the audio files, as well as their meta data including the transcription labels. + +For how to generate such manifest files, please refer to `data/librispeech/librispeech.py`, which will download data and generate manifest files for LibriSpeech dataset. + +### Compute Mean & Stddev for Normalizer + +To perform z-score normalization (zero-mean, unit stddev) upon audio features, we have to estimate in advance the mean and standard deviation of the features, with some training samples: + +```bash +python3 tools/compute_mean_std.py \ +--num_samples 2000 \ +--specgram_type linear \ +--manifest_path data/librispeech/manifest.train \ +--output_path data/librispeech/mean_std.npz +``` + +It will compute the mean and standard deviatio of power spectrum feature with 2000 random sampled audio clips listed in `data/librispeech/manifest.train` and save the results to `data/librispeech/mean_std.npz` for further usage. + + +### Build Vocabulary + +A vocabulary of possible characters is required to convert the transcription into a list of token indices for training, and in decoding, to convert from a list of indices back to text again. Such a character-based vocabulary can be built with `tools/build_vocab.py`. + +```bash +python3 tools/build_vocab.py \ +--count_threshold 0 \ +--vocab_path data/librispeech/eng_vocab.txt \ +--manifest_paths data/librispeech/manifest.train +``` + +It will write a vocabuary file `data/librispeeech/eng_vocab.txt` with all transcription text in `data/librispeech/manifest.train`, without vocabulary truncation (`--count_threshold 0`). + +### More Help + +For more help on arguments: + +```bash +python3 data/librispeech/librispeech.py --help +python3 tools/compute_mean_std.py --help +python3 tools/build_vocab.py --help +``` + +## Training a model + +`train.py` is the main caller of the training module. Examples of usage are shown below. 
+ +- Start training from scratch with 8 GPUs: + + ``` + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 train.py + ``` + +- Start training from scratch with CPUs: + + ``` + python3 train.py --use_gpu False + ``` +- Resume training from a checkpoint: + + ``` + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ + python3 train.py \ + --init_from_pretrained_model CHECKPOINT_PATH_TO_RESUME_FROM + ``` + +For more help on arguments: + +```bash +python3 train.py --help +``` +or refer to `example/librispeech/run_train.sh`. + + +## Data Augmentation Pipeline + +Data augmentation has often been a highly effective technique to boost the deep learning performance. We augment our speech data by synthesizing new audios with small random perturbation (label-invariant transformation) added upon raw audios. You don't have to do the syntheses on your own, as it is already embedded into the data provider and is done on the fly, randomly for each epoch during training. + +Six optional augmentation components are provided to be selected, configured and inserted into the processing pipeline. + + - Volume Perturbation + - Speed Perturbation + - Shifting Perturbation + - Online Bayesian normalization + - Noise Perturbation (need background noise audio files) + - Impulse Response (need impulse audio files) + +In order to inform the trainer of what augmentation components are needed and what their processing orders are, it is required to prepare in advance an *augmentation configuration file* in [JSON](http://www.json.org/) format. For example: + +``` +[{ + "type": "speed", + "params": {"min_speed_rate": 0.95, + "max_speed_rate": 1.05}, + "prob": 0.6 +}, +{ + "type": "shift", + "params": {"min_shift_ms": -5, + "max_shift_ms": 5}, + "prob": 0.8 +}] +``` + +When the `--augment_conf_file` argument of `trainer.py` is set to the path of the above example configuration file, every audio clip in every epoch will be processed: with 60% of chance, it will first be speed perturbed with a uniformly random sampled speed-rate between 0.95 and 1.05, and then with 80% of chance it will be shifted in time with a random sampled offset between -5 ms and 5 ms. Finally this newly synthesized audio clip will be feed into the feature extractor for further training. + +For other configuration examples, please refer to `conf/augmenatation.config.example`. + +Be careful when utilizing the data augmentation technique, as improper augmentation will do harm to the training, due to the enlarged train-test gap. + +## Inference and Evaluation + +### Prepare Language Model + +A language model is required to improve the decoder's performance. We have prepared two language models (with lossy compression) for users to download and try. One is for English and the other is for Mandarin. Users can simply run this to download the preprared language models: + +```bash +cd models/lm +bash download_lm_en.sh +bash download_lm_ch.sh +``` + +If you wish to train your own better language model, please refer to [KenLM](https://github.com/kpu/kenlm) for tutorials. Here we provide some tips to show how we preparing our English and Mandarin language models. You can take it as a reference when you train your own. + +#### English LM + +The English corpus is from the [Common Crawl Repository](http://commoncrawl.org) and you can download it from [statmt](http://data.statmt.org/ngrams/deduped_en). We use part en.00 to train our English language model. 
There are some preprocessing steps before training: + + * Characters not in \['A-Za-z0-9\s'\] (\s represents whitespace characters) are removed and Arabic numbers are converted to English numbers like 1000 to one thousand. + * Repeated whitespace characters are squeezed to one and the beginning whitespace characters are removed. Notice that all transcriptions are lowercase, so all characters are converted to lowercase. + * Top 400,000 most frequent words are selected to build the vocabulary and the rest are replaced with 'UNKNOWNWORD'. + +Now the preprocessing is done and we get a clean corpus to train the language model. Our released language model are trained with agruments '-o 5 --prune 0 1 1 1 1'. '-o 5' means the max order of language model is 5. '--prune 0 1 1 1 1' represents count thresholds for each order and more specifically it will prune singletons for orders two and higher. To save disk storage we convert the arpa file to 'trie' binary file with arguments '-a 22 -q 8 -b 8'. '-a' represents the maximum number of leading bits of pointers in 'trie' to chop. '-q -b' are quantization parameters for probability and backoff. + +#### Mandarin LM + +Different from the English language model, Mandarin language model is character-based where each token is a Chinese character. We use internal corpus to train the released Mandarin language models. The corpus contain billions of tokens. The preprocessing has tiny difference from English language model and main steps include: + + * The beginning and trailing whitespace characters are removed. + * English punctuations and Chinese punctuations are removed. + * A whitespace character between two tokens is inserted. + +Please notice that the released language models only contain Chinese simplified characters. After preprocessing done we can begin to train the language model. The key training arguments for small LM is '-o 5 --prune 0 1 2 4 4' and '-o 5' for large LM. Please refer above section for the meaning of each argument. We also convert the arpa file to binary file using default settings. + +### Speech-to-text Inference + +An inference module caller `infer.py` is provided to infer, decode and visualize speech-to-text results for several given audio clips. It might help to have an intuitive and qualitative evaluation of the ASR model's performance. + +- Inference with GPU: + + ```bash + CUDA_VISIBLE_DEVICES=0 python3 infer.py + ``` + +- Inference with CPUs: + + ```bash + python3 infer.py --use_gpu False + ``` + +We provide two types of CTC decoders: *CTC greedy decoder* and *CTC beam search decoder*. The *CTC greedy decoder* is an implementation of the simple best-path decoding algorithm, selecting at each timestep the most likely token, thus being greedy and locally optimal. The [*CTC beam search decoder*](https://arxiv.org/abs/1408.2873) otherwise utilizes a heuristic breadth-first graph search for reaching a near global optimality; it also requires a pre-trained KenLM language model for better scoring and ranking. The decoder type can be set with argument `--decoding_method`. + +For more help on arguments: + +``` +python3 infer.py --help +``` +or refer to `example/librispeech/run_infer.sh`. 
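To make the distinction between the two decoders concrete, the greedy (best-path) strategy can be sketched in a few lines of NumPy. This is only an illustration of the algorithm, not the decoder implemented in this project; the function and variable names (`ctc_greedy_decode`, `id_to_char`) are made up for the example, and the real decoders additionally handle batching, pruning and language-model scoring.

```python
import numpy as np

def ctc_greedy_decode(probs, id_to_char, blank_id=0):
    """Best-path CTC decoding: take the most likely token at every timestep,
    merge consecutive duplicates, then drop the blank symbol."""
    best_path = np.argmax(probs, axis=1)                  # (T,) most likely id per frame
    collapsed = [int(t) for i, t in enumerate(best_path)  # merge consecutive duplicates
                 if i == 0 or t != best_path[i - 1]]
    return "".join(id_to_char[t] for t in collapsed if t != blank_id)

# Toy example: 3 symbols (0 = blank, 1 = 'a', 2 = 'b') over 5 frames.
probs = np.array([[0.1, 0.8, 0.1],
                  [0.1, 0.8, 0.1],
                  [0.7, 0.2, 0.1],
                  [0.1, 0.1, 0.8],
                  [0.1, 0.1, 0.8]])
print(ctc_greedy_decode(probs, {1: "a", 2: "b"}))  # -> "ab"
```

The beam search decoder instead keeps many candidate paths at once and rescores them with the KenLM language model, which is why it needs the `--lang_model_path`, `--alpha` and `--beta` arguments shown in the example scripts.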
+ +### Evaluate a Model + +To evaluate a model's performance quantitatively, please run: + +- Evaluation with GPUs: + + ```bash + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 test.py + ``` + +- Evaluation with CPUs: + + ```bash + python3 test.py --use_gpu False + ``` + +The error rate (default: word error rate; can be set with `--error_rate_type`) will be printed. + +For more help on arguments: + +```bash +python3 test.py --help +``` +or refer to `example/librispeech/run_test.sh`. + +## Hyper-parameters Tuning + +The hyper-parameters $\alpha$ (language model weight) and $\beta$ (word insertion weight) for the [*CTC beam search decoder*](https://arxiv.org/abs/1408.2873) often have a significant impact on the decoder's performance. It would be better to re-tune them on the validation set when the acoustic model is renewed. + +`tools/tune.py` performs a 2-D grid search over the hyper-parameter $\alpha$ and $\beta$. You must provide the range of $\alpha$ and $\beta$, as well as the number of their attempts. + +- Tuning with GPU: + + ```bash + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ + python3 tools/tune.py \ + --alpha_from 1.0 \ + --alpha_to 3.2 \ + --num_alphas 45 \ + --beta_from 0.1 \ + --beta_to 0.45 \ + --num_betas 8 + ``` + +- Tuning with CPU: + + ```bash + python3 tools/tune.py --use_gpu False + ``` + The grid search will print the WER (word error rate) or CER (character error rate) at each point in the hyper-parameters space, and draw the error surface optionally. A proper hyper-parameters range should include the global minima of the error surface for WER/CER, as illustrated in the following figure. + +

+Figure: An example error surface for tuning on the dev-clean set of LibriSpeech
+ +Usually, as the figure shows, the variation of language model weight ($\alpha$) significantly affect the performance of CTC beam search decoder. And a better procedure is to first tune on serveral data batches (the number can be specified) to find out the proper range of hyper-parameters, then change to the whole validation set to carray out an accurate tuning. + +After tuning, you can reset $\alpha$ and $\beta$ in the inference and evaluation modules to see if they really help improve the ASR performance. For more help + +```bash +python3 tune.py --help +``` +or refer to `example/librispeech/run_tune.sh`. + +## Training for Mandarin Language + +The key steps of training for Mandarin language are same to that of English language and we have also provided an example for Mandarin training with Aishell in ```examples/aishell```. As mentioned above, please execute ```sh run_data.sh```, ```sh run_train.sh```, ```sh run_test.sh``` and ```sh run_infer.sh``` to do data preparation, training, testing and inference correspondingly. We have also prepared a pre-trained model (downloaded by ./models/aishell/download_model.sh) for users to try with ```sh run_infer_golden.sh``` and ```sh run_test_golden.sh```. Notice that, different from English LM, the Mandarin LM is character-based and please run ```tools/tune.py``` to find an optimal setting. + +## Trying Live Demo with Your Own Voice + +Until now, an ASR model is trained and tested qualitatively (`infer.py`) and quantitatively (`test.py`) with existing audio files. But it is not yet tested with your own speech. `deploy/demo_english_server.py` and `deploy/demo_client.py` helps quickly build up a real-time demo ASR engine with the trained model, enabling you to test and play around with the demo, with your own voice. + +To start the demo's server, please run this in one console: + +```bash +CUDA_VISIBLE_DEVICES=0 \ +python3 deploy/demo_server.py \ +--host_ip localhost \ +--host_port 8086 +``` + +For the machine (might not be the same machine) to run the demo's client, please do the following installation before moving on. + +For example, on MAC OS X: + +```bash +brew install portaudio +pip install pyaudio +pip install keyboard +``` + +Then to start the client, please run this in another console: + +```bash +CUDA_VISIBLE_DEVICES=0 \ +python3 -u deploy/demo_client.py \ +--host_ip 'localhost' \ +--host_port 8086 +``` + +Now, in the client console, press the `whitespace` key, hold, and start speaking. Until finishing your utterance, release the key to let the speech-to-text results shown in the console. To quit the client, just press `ESC` key. + +Notice that `deploy/demo_client.py` must be run on a machine with a microphone device, while `deploy/demo_server.py` could be run on one without any audio recording hardware, e.g. any remote server machine. Just be careful to set the `host_ip` and `host_port` argument with the actual accessible IP address and port, if the server and client are running with two separate machines. Nothing should be done if they are running on one single machine. + +Please also refer to `examples/deploy_demo/run_english_demo_server.sh`, which will first download a pre-trained English model (trained with 3000 hours of internal speech data) and then start the demo server with the model. With running `examples/mandarin/run_demo_client.sh`, you can speak English to test it. If you would like to try some other models, just update `--model_path` argument in the script. 
+ +For more help on arguments: + +```bash +python3 deploy/demo_server.py --help +python3 deploy/demo_client.py --help +``` + +## Released Models + +#### Speech Model Released + +Language | Model Name | Training Data | Hours of Speech +:-----------: | :------------: | :----------: | -------: +English | [LibriSpeech Model](https://deepspeech.bj.bcebos.com/eng_models/librispeech_model_fluid.tar.gz) | [LibriSpeech Dataset](http://www.openslr.org/12/) | 960 h +English | [BaiduEN8k Model](https://deepspeech.bj.bcebos.com/demo_models/baidu_en8k_model_fluid.tar.gz) | Baidu Internal English Dataset | 8628 h +Mandarin | [Aishell Model](https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_fluid.tar.gz) | [Aishell Dataset](http://www.openslr.org/33/) | 151 h +Mandarin | [BaiduCN1.2k Model](https://deepspeech.bj.bcebos.com/demo_models/baidu_cn1.2k_model_fluid.tar.gz) | Baidu Internal Mandarin Dataset | 1204 h + +#### Language Model Released + +Language Model | Training Data | Token-based | Size | Descriptions +:-------------:| :------------:| :-----: | -----: | :----------------- +[English LM](https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm) | [CommonCrawl(en.00)](http://web-language-models.s3-website-us-east-1.amazonaws.com/ngrams/en/deduped/en.00.deduped.xz) | Word-based | 8.3 GB | Pruned with 0 1 1 1 1;
About 1.85 billion n-grams;
'trie' binary with '-a 22 -q 8 -b 8' +[Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4;
About 0.13 billion n-grams;
'probing' binary with default settings +[Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning;
About 3.7 billion n-grams;
'probing' binary with default settings + +## Experiments and Benchmarks + +#### Benchmark Results for English Models (Word Error Rate) + +Test Set | LibriSpeech Model | BaiduEN8K Model +:--------------------- | ---------------: | -------------------: +LibriSpeech Test-Clean | 6.85 | 5.41 +LibriSpeech Test-Other | 21.18 | 13.85 +VoxForge American-Canadian | 12.12 | 7.13 +VoxForge Commonwealth | 19.82 | 14.93 +VoxForge European | 30.15 | 18.64 +VoxForge Indian | 53.73 | 25.51 +Baidu Internal Testset | 40.75 | 8.48 + +For reproducing benchmark results on VoxForge data, we provide a script to download data and generate VoxForge dialect manifest files. Please go to ```data/voxforge``` and execute ```sh run_data.sh``` to get VoxForge dialect manifest files. Notice that VoxForge data may keep updating and the generated manifest files may have difference from those we evaluated on. + +#### Benchmark Results for Mandarin Model (Character Error Rate) + +Test Set | BaiduCN1.2k Model +:--------------------- | -------------------: +Baidu Internal Testset | 12.64 + +#### Acceleration with Multi-GPUs + +We compare the training time with 1, 2, 4, 8 Tesla V100 GPUs (with a subset of LibriSpeech samples whose audio durations are between 6.0 and 7.0 seconds). And it shows that a **near-linear** acceleration with multiple GPUs has been achieved. In the following figure, the time (in seconds) cost for training is printed on the blue bars. + +
+ +| # of GPU | Acceleration Rate | +| -------- | --------------: | +| 1 | 1.00 X | +| 2 | 1.98 X | +| 4 | 3.73 X | +| 8 | 6.95 X | + +`tools/profile.sh` provides such a profiling tool. + +## Questions and Help + +You are welcome to submit questions and bug reports in [Github Issues](https://github.com/PaddlePaddle/DeepSpeech/issues). You are also welcome to contribute to this project. diff --git a/docs/augmentation.md b/docs/augmentation.md new file mode 100644 index 000000000..e4b7c0012 --- /dev/null +++ b/docs/augmentation.md @@ -0,0 +1,36 @@ + +# Data Augmentation Pipeline + +Data augmentation has often been a highly effective technique to boost the deep learning performance. We augment our speech data by synthesizing new audios with small random perturbation (label-invariant transformation) added upon raw audios. You don't have to do the syntheses on your own, as it is already embedded into the data provider and is done on the fly, randomly for each epoch during training. + +Six optional augmentation components are provided to be selected, configured and inserted into the processing pipeline. + + - Volume Perturbation + - Speed Perturbation + - Shifting Perturbation + - Online Bayesian normalization + - Noise Perturbation (need background noise audio files) + - Impulse Response (need impulse audio files) + +In order to inform the trainer of what augmentation components are needed and what their processing orders are, it is required to prepare in advance an *augmentation configuration file* in [JSON](http://www.json.org/) format. For example: + +``` +[{ + "type": "speed", + "params": {"min_speed_rate": 0.95, + "max_speed_rate": 1.05}, + "prob": 0.6 +}, +{ + "type": "shift", + "params": {"min_shift_ms": -5, + "max_shift_ms": 5}, + "prob": 0.8 +}] +``` + +When the `augment_conf_file` argument is set to the path of the above example configuration file, every audio clip in every epoch will be processed: with 60% of chance, it will first be speed perturbed with a uniformly random sampled speed-rate between 0.95 and 1.05, and then with 80% of chance it will be shifted in time with a random sampled offset between -5 ms and 5 ms. Finally this newly synthesized audio clip will be feed into the feature extractor for further training. + +For other configuration examples, please refer to `examples/conf/augmentation.config.example`. + +Be careful when utilizing the data augmentation technique, as improper augmentation will do harm to the training, due to the enlarged train-test gap. diff --git a/docs/benchmark.md b/docs/benchmark.md new file mode 100644 index 000000000..4ef3e680c --- /dev/null +++ b/docs/benchmark.md @@ -0,0 +1,16 @@ +# Benchmarks + +## Acceleration with Multi-GPUs + +We compare the training time with 1, 2, 4, 8 Tesla V100 GPUs (with a subset of LibriSpeech samples whose audio durations are between 6.0 and 7.0 seconds). And it shows that a **near-linear** acceleration with multiple GPUs has been achieved. In the following figure, the time (in seconds) cost for training is printed on the blue bars. + +
+ +| # of GPU | Acceleration Rate | +| -------- | --------------: | +| 1 | 1.00 X | +| 2 | 1.98 X | +| 4 | 3.73 X | +| 8 | 6.95 X | + +`utils/profile.sh` provides such a demo profiling tool, you can change it as need. diff --git a/docs/data_preparation.md b/docs/data_preparation.md new file mode 100644 index 000000000..7b6142bde --- /dev/null +++ b/docs/data_preparation.md @@ -0,0 +1,43 @@ + +# Data Preparation + +## Generate Manifest + +*DeepSpeech2 on PaddlePaddle* accepts a textual **manifest** file as its data set interface. A manifest file summarizes a set of speech data, with each line containing some meta data (e.g. filepath, transcription, duration) of one audio clip, in [JSON](http://www.json.org/) format, such as: + +``` +{"audio_filepath": "/home/work/.cache/paddle/Libri/134686/1089-134686-0001.flac", "duration": 3.275, "text": "stuff it into you his belly counselled him"} +{"audio_filepath": "/home/work/.cache/paddle/Libri/134686/1089-134686-0007.flac", "duration": 4.275, "text": "a cold lucid indifference reigned in his soul"} +``` + +To use your custom data, you only need to generate such manifest files to summarize the dataset. Given such summarized manifests, training, inference and all other modules can be aware of where to access the audio files, as well as their meta data including the transcription labels. + +For how to generate such manifest files, please refer to `examples/librispeech/local/librispeech.py`, which will download data and generate manifest files for LibriSpeech dataset. + +## Compute Mean & Stddev for Normalizer + +To perform z-score normalization (zero-mean, unit stddev) upon audio features, we have to estimate in advance the mean and standard deviation of the features, with some training samples: + +```bash +python3 utils/compute_mean_std.py \ +--num_samples 2000 \ +--specgram_type linear \ +--manifest_path examples/librispeech/data/manifest.train \ +--output_path examples/librispeech/data/mean_std.npz +``` + +It will compute the mean and standard deviatio of power spectrum feature with 2000 random sampled audio clips listed in `examples/librispeech/data/manifest.train` and save the results to `examples/librispeech/data/mean_std.npz` for further usage. + + +## Build Vocabulary + +A vocabulary of possible characters is required to convert the transcription into a list of token indices for training, and in decoding, to convert from a list of indices back to text again. Such a character-based vocabulary can be built with `utils/build_vocab.py`. + +```bash +python3 utils/build_vocab.py \ +--count_threshold 0 \ +--vocab_path examples/librispeech/data/eng_vocab.txt \ +--manifest_paths examples/librispeech/data/manifest.train +``` + +It will write a vocabuary file `examples/librispeech/data/eng_vocab.txt` with all transcription text in `examples/librispeech/data/manifest.train`, without vocabulary truncation (`--count_threshold 0`). diff --git a/docs/geting_started.md b/docs/geting_started.md new file mode 100644 index 000000000..fddb639a9 --- /dev/null +++ b/docs/geting_started.md @@ -0,0 +1,80 @@ +# Getting Started + +Several shell scripts provided in `./examples/tiny/local` will help us to quickly give it a try, for most major modules, including data preparation, model training, case inference and model evaluation, with a few public dataset (e.g. [LibriSpeech](http://www.openslr.org/12/), [Aishell](http://www.openslr.org/33)). Reading these examples will also help you to understand how to make it work with your own data. 
+ +Some of the scripts in `./examples` are not configured with GPUs. If you want to train with 8 GPUs, please modify `CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7`. If you don't have any GPU available, please set `CUDA_VISIBLE_DEVICES=` to use CPUs instead. Besides, if an out-of-memory problem occurs, just reduce `batch_size` to fit. + +Let's take a tiny sampled subset of the [LibriSpeech dataset](http://www.openslr.org/12/) for instance. + +- Go to directory + + ```bash + cd examples/tiny + ``` + + Notice that this is only a toy example with a tiny sampled subset of LibriSpeech. If you would like to try with the complete dataset (which would take several days for training), please go to `examples/librispeech` instead. + +- Source env + + ```bash + source path.sh + ``` + **You must do this before doing anything else.** + It sets `MAIN_ROOT` to the project directory and uses the `deepspeech2` model by default; you can change this in the script. + +- Main entrypoint + + ```bash + bash run.sh + ``` + This is just a demo; please make sure each `step` works fine before moving on to the next `step`. + +More detailed information is provided in the following sections. Wish you a happy journey with the *DeepSpeech on PaddlePaddle* ASR engine! + +## Training a model + +The key steps of training for the Mandarin language are the same as those for English, and we have also provided an example for Mandarin training with Aishell in ```examples/aishell/local```. As mentioned above, please execute ```sh data.sh```, ```sh train.sh```, ```sh test.sh``` and ```sh infer.sh``` to do data preparation, training, testing and inference respectively. We have also prepared a pre-trained model (downloaded by local/download_model.sh) for users to try with ```sh infer_golden.sh``` and ```sh test_golden.sh```. Notice that, unlike the English LM, the Mandarin LM is character-based; please run ```local/tune.sh``` to find an optimal setting. + +## Speech-to-text Inference + +An inference module, `infer.py`, is provided to infer, decode and visualize speech-to-text results for several given audio clips. It can help to get an intuitive and qualitative feel for the ASR model's performance. + +```bash +CUDA_VISIBLE_DEVICES=0 bash local/infer.sh +``` + +We provide two types of CTC decoders: *CTC greedy decoder* and *CTC beam search decoder*. The *CTC greedy decoder* is an implementation of the simple best-path decoding algorithm, selecting at each timestep the most likely token, thus being greedy and locally optimal. The [*CTC beam search decoder*](https://arxiv.org/abs/1408.2873), by contrast, utilizes a heuristic breadth-first graph search to reach near-global optimality; it also requires a pre-trained KenLM language model for better scoring and ranking. The decoder type can be set with the argument `decoding_method`. + +## Evaluate a Model + +To evaluate a model's performance quantitatively, please run: + +```bash +CUDA_VISIBLE_DEVICES=0 bash local/test.sh +``` + +The error rate (default: word error rate; can be set with `error_rate_type`) will be printed. + +For more help on arguments: + +## Hyper-parameters Tuning + +The hyper-parameters $\alpha$ (language model weight) and $\beta$ (word insertion weight) for the [*CTC beam search decoder*](https://arxiv.org/abs/1408.2873) often have a significant impact on the decoder's performance. It would be better to re-tune them on the validation set when the acoustic model is renewed. + +`tune.py` performs a 2-D grid search over the hyper-parameters $\alpha$ and $\beta$. 
You must provide the range of $\alpha$ and $\beta$, as well as the number of their attempts. + + +```bash +CUDA_VISIBLE_DEVICES=0 bash local/tune.sh +``` + + The grid search will print the WER (word error rate) or CER (character error rate) at each point in the hyper-parameters space, and draw the error surface optionally. A proper hyper-parameters range should include the global minima of the error surface for WER/CER, as illustrated in the following figure. + +

+Figure: An example error surface for tuning on the dev-clean set of LibriSpeech
+ +Usually, as the figure shows, the variation of language model weight ($\alpha$) significantly affect the performance of CTC beam search decoder. And a better procedure is to first tune on serveral data batches (the number can be specified) to find out the proper range of hyper-parameters, then change to the whole validation set to carray out an accurate tuning. + +After tuning, you can reset $\alpha$ and $\beta$ in the inference and evaluation modules to see if they really help improve the ASR performance. For more help diff --git a/docs/install.md b/docs/install.md new file mode 100644 index 000000000..71396590f --- /dev/null +++ b/docs/install.md @@ -0,0 +1,81 @@ +# Installation + +To avoid the trouble of environment setup, [running in Docker container](#running-in-docker-container) is highly recommended. Otherwise follow the guidelines below to install the dependencies manually. + +## Prerequisites +- Python >= 3.7 +- PaddlePaddle 2.0.0 or later (please refer to the [Installation Guide](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/index_en.html)) + +## Setup + +- Make sure these libraries or tools installed: `pkg-config`, `flac`, `ogg`, `vorbis`, `boost` and `swig`, e.g. installing them via `apt-get`: + +```bash +sudo apt-get install -y pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev +``` + +or, installing them via `yum`: + +```bash +sudo yum install pkgconfig libogg-devel libvorbis-devel boost-devel python3-devel +wget https://ftp.osuosl.org/pub/xiph/releases/flac/flac-1.3.1.tar.xz +xz -d flac-1.3.1.tar.xz +tar -xvf flac-1.3.1.tar +cd flac-1.3.1 +./configure +make +make install +``` + +- Run the setup script for the remaining dependencies + +```bash +git clone https://github.com/PaddlePaddle/DeepSpeech.git +cd DeepSpeech +pushd tools; make; popd +source tools/venv/bin/activate +bash setup.sh +``` + +- Source venv before do experiment. + +```bash +source tools/venv/bin/activate +``` + +## Running in Docker Container + +Docker is an open source tool to build, ship, and run distributed applications in an isolated environment. A Docker image for this project has been provided in [hub.docker.com](https://hub.docker.com) with all the dependencies installed, including the pre-built PaddlePaddle, CTC decoders, and other necessary Python and third-party packages. This Docker image requires the support of NVIDIA GPU, so please make sure its availiability and the [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) has been installed. + +Take several steps to launch the Docker image: + +- Download the Docker image + +For example, pull paddle 2.0.0 image: + +```bash +nvidia-docker pull registry.baidubce.com/paddlepaddle/paddle:2.0.0-gpu-cuda10.1-cudnn7 +``` + +- Clone this repository + +``` +git clone https://github.com/PaddlePaddle/DeepSpeech.git +``` + +- Run the Docker image + +```bash +sudo nvidia-docker run --rm -it -v $(pwd)/DeepSpeech:/DeepSpeech registry.baidubce.com/paddlepaddle/paddle:2.0.0-gpu-cuda10.1-cudnn7 /bin/bash +``` + +Now you can execute training, inference and hyper-parameters tuning in the Docker container. + + +- Install PaddlePaddle + +For example, for CUDA 10.1, CuDNN7.5 install paddle 2.0.0: + +```bash +python3 -m pip install paddlepaddle-gpu==2.0.0 +``` diff --git a/docs/ngram_lm.md b/docs/ngram_lm.md new file mode 100644 index 000000000..48c557ce9 --- /dev/null +++ b/docs/ngram_lm.md @@ -0,0 +1,31 @@ +# Prepare Language Model + +A language model is required to improve the decoder's performance. 
We have prepared two language models (with lossy compression) for users to download and try. One is for English and the other is for Mandarin. Users can simply run this to download the preprared language models: + +```bash +cd examples/aishell +source path.sh +bash local/download_lm_ch.sh +``` + +If you wish to train your own better language model, please refer to [KenLM](https://github.com/kpu/kenlm) for tutorials. Here we provide some tips to show how we preparing our English and Mandarin language models. You can take it as a reference when you train your own. + +## English LM + +The English corpus is from the [Common Crawl Repository](http://commoncrawl.org) and you can download it from [statmt](http://data.statmt.org/ngrams/deduped_en). We use part en.00 to train our English language model. There are some preprocessing steps before training: + + * Characters not in \['A-Za-z0-9\s'\] (\s represents whitespace characters) are removed and Arabic numbers are converted to English numbers like 1000 to one thousand. + * Repeated whitespace characters are squeezed to one and the beginning whitespace characters are removed. Notice that all transcriptions are lowercase, so all characters are converted to lowercase. + * Top 400,000 most frequent words are selected to build the vocabulary and the rest are replaced with 'UNKNOWNWORD'. + +Now the preprocessing is done and we get a clean corpus to train the language model. Our released language model are trained with agruments '-o 5 --prune 0 1 1 1 1'. '-o 5' means the max order of language model is 5. '--prune 0 1 1 1 1' represents count thresholds for each order and more specifically it will prune singletons for orders two and higher. To save disk storage we convert the arpa file to 'trie' binary file with arguments '-a 22 -q 8 -b 8'. '-a' represents the maximum number of leading bits of pointers in 'trie' to chop. '-q -b' are quantization parameters for probability and backoff. + +## Mandarin LM + +Different from the English language model, Mandarin language model is character-based where each token is a Chinese character. We use internal corpus to train the released Mandarin language models. The corpus contain billions of tokens. The preprocessing has tiny difference from English language model and main steps include: + + * The beginning and trailing whitespace characters are removed. + * English punctuations and Chinese punctuations are removed. + * A whitespace character between two tokens is inserted. + +Please notice that the released language models only contain Chinese simplified characters. After preprocessing done we can begin to train the language model. The key training arguments for small LM is '-o 5 --prune 0 1 2 4 4' and '-o 5' for large LM. Please refer above section for the meaning of each argument. We also convert the arpa file to binary file using default settings. diff --git a/docs/released_model.md b/docs/released_model.md new file mode 100644 index 000000000..0919bba58 --- /dev/null +++ b/docs/released_model.md @@ -0,0 +1,9 @@ +# Released Models + +## Language Model Released + +Language Model | Training Data | Token-based | Size | Descriptions +:-------------:| :------------:| :-----: | -----: | :----------------- +[English LM](https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm) | [CommonCrawl(en.00)](http://web-language-models.s3-website-us-east-1.amazonaws.com/ngrams/en/deduped/en.00.deduped.xz) | Word-based | 8.3 GB | Pruned with 0 1 1 1 1;
About 1.85 billion n-grams;
'trie' binary with '-a 22 -q 8 -b 8' +[Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4;
About 0.13 billion n-grams;
'probing' binary with default settings +[Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning;
About 3.7 billion n-grams;
'probing' binary with default settings diff --git a/docs/server.md b/docs/server.md new file mode 100644 index 000000000..019ebcfa4 --- /dev/null +++ b/docs/server.md @@ -0,0 +1,34 @@ + +# Trying Live Demo with Your Own Voice + +Until now, an ASR model is trained and tested qualitatively (`infer`) and quantitatively (`test`) with existing audio files. But it is not yet tested with your own speech. We build up a real-time demo ASR engine with the trained model, enabling you to test and play around with the demo, with your own voice. + +First, change your directory to `examples/aishell` and `source path.sh`. + +To start the demo's server, please run this in one console: + +```bash +CUDA_VISIBLE_DEVICES=0 bash local/server.sh +``` + +For the machine (might not be the same machine) to run the demo's client, please do the following installation before moving on. + +For example, on MAC OS X: + +```bash +brew install portaudio +pip install pyaudio +pip install keyboard +``` + +Then to start the client, please run this in another console: + +```bash +CUDA_VISIBLE_DEVICES=0 bash local/client.sh +``` + +Now, in the client console, press the `whitespace` key, hold, and start speaking. Until finishing your utterance, release the key to let the speech-to-text results shown in the console. To quit the client, just press `ESC` key. + +Notice that `deepspeech/exps/deepspeech2/deploy/client.py` must be run on a machine with a microphone device, while `deepspeech/exps/deepspeech2/deploy/server.py` could be run on one without any audio recording hardware, e.g. any remote server machine. Just be careful to set the `host_ip` and `host_port` argument with the actual accessible IP address and port, if the server and client are running with two separate machines. Nothing should be done if they are running on one single machine. + +Please also refer to `examples/aishell/local/server.sh`, which will first download a pre-trained Chinese model (trained with AISHELL1) and then start the demo server with the model. With running `examples/aishell/local/client.sh`, you can speak Chinese to test it. If you would like to try some other models, just update `--checkpoint_path` argument in the script.   
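
For intuition only, the push-to-talk capture that the client performs locally can be approximated with the same `pyaudio` and `keyboard` packages mentioned above. This is a rough, hypothetical sketch under those assumptions, not the project's client: the real `client.py` also streams the recorded audio to the server at `host_ip:host_port` and prints the returned transcription, which is omitted here.

```python
import keyboard   # pip install keyboard
import pyaudio    # pip install pyaudio (requires portaudio)

CHUNK, RATE = 1024, 16000  # 16 kHz, 16-bit mono, matching the models' expected input

pa = pyaudio.PyAudio()
stream = pa.open(format=pyaudio.paInt16, channels=1, rate=RATE,
                 input=True, frames_per_buffer=CHUNK)

print("Hold <space> and speak; release the key to stop.")
keyboard.wait("space")                      # block until the key goes down
frames = []
while keyboard.is_pressed("space"):         # record while the key is held
    frames.append(stream.read(CHUNK))

stream.stop_stream()
stream.close()
pa.terminate()

audio_bytes = b"".join(frames)              # raw PCM that the real client would send to the server
print(f"captured {len(audio_bytes)} bytes of audio")
```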
diff --git a/examples/baidu_en8k/path.sh b/env.sh similarity index 69% rename from examples/baidu_en8k/path.sh rename to env.sh index fd1cebba8..131c6495a 100644 --- a/examples/baidu_en8k/path.sh +++ b/env.sh @@ -1,4 +1,4 @@ -export MAIN_ROOT=${PWD}/../../ +export MAIN_ROOT=${PWD} export PATH=${MAIN_ROOT}:${PWD}/tools:${PATH} export LC_ALL=C @@ -6,3 +6,5 @@ export LC_ALL=C # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +export export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ diff --git a/examples/aishell/.gitignore b/examples/aishell/.gitignore new file mode 100644 index 000000000..389676a70 --- /dev/null +++ b/examples/aishell/.gitignore @@ -0,0 +1,4 @@ +data +ckpt* +demo_cache +*.log diff --git a/conf/augmentation.config b/examples/aishell/conf/augmentation.config similarity index 100% rename from conf/augmentation.config rename to examples/aishell/conf/augmentation.config diff --git a/examples/aishell/conf/deepspeech2.yaml b/examples/aishell/conf/deepspeech2.yaml new file mode 100644 index 000000000..821c183e5 --- /dev/null +++ b/examples/aishell/conf/deepspeech2.yaml @@ -0,0 +1,47 @@ +# https://yaml.org/type/float.html +data: + train_manifest: data/manifest.train + dev_manifest: data/manifest.dev + test_manifest: data/manifest.test + mean_std_filepath: data/mean_std.npz + vocab_filepath: data/vocab.txt + augmentation_config: conf/augmentation.config + batch_size: 64 # one gpu + max_duration: 27.0 + min_duration: 0.0 + specgram_type: linear + target_sample_rate: 16000 + max_freq: None + n_fft: None + stride_ms: 10.0 + window_ms: 20.0 + use_dB_normalization: True + target_dB: -20 + random_seed: 0 + keep_transcription_text: False + sortagrad: True + shuffle_method: batch_shuffle + num_workers: 0 +model: + num_conv_layers: 2 + num_rnn_layers: 3 + rnn_layer_size: 1024 + use_gru: True + share_rnn_weights: False +training: + n_epoch: 30 + lr: 5e-4 + lr_decay: 0.83 + weight_decay: 1e-06 + global_grad_clip: 5.0 +decoding: + batch_size: 128 + error_rate_type: cer + decoding_method: ctc_beam_search + lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm + alpha: 2.6 + beta: 5.0 + beam_size: 300 + cutoff_prob: 0.99 + cutoff_top_n: 40 + num_proc_bsearch: 10 diff --git a/examples/deploy_demo/run_demo_client.sh b/examples/aishell/local/client.sh similarity index 60% rename from examples/deploy_demo/run_demo_client.sh rename to examples/aishell/local/client.sh index 60581c661..d626ecc75 100644 --- a/examples/deploy_demo/run_demo_client.sh +++ b/examples/aishell/local/client.sh @@ -2,9 +2,13 @@ source path.sh +# run on MacOS +# brew install portaudio +# pip install pyaudio +# pip install keyboard + # start demo client -CUDA_VISIBLE_DEVICES=0 \ -python3 -u ${MAIN_ROOT}/deploy/demo_client.py \ +python3 -u ${BIN_DIR}/deploy/client.py \ --host_ip="localhost" \ --host_port=8086 \ @@ -13,5 +17,4 @@ if [ $? 
-ne 0 ]; then exit 1 fi - exit 0 diff --git a/examples/aishell/local/run_data.sh b/examples/aishell/local/data.sh similarity index 73% rename from examples/aishell/local/run_data.sh rename to examples/aishell/local/data.sh index b874b2df8..6eeb3d8fc 100644 --- a/examples/aishell/local/run_data.sh +++ b/examples/aishell/local/data.sh @@ -2,10 +2,13 @@ mkdir -p data +TARGET_DIR=${MAIN_ROOT}/examples/dataset +mkdir -p ${TARGET_DIR} + # download data, generate manifests -PYTHONPATH=.:$PYTHONPATH python3 local/aishell.py \ +PYTHONPATH=.:$PYTHONPATH python3 ${TARGET_DIR}/aishell/aishell.py \ --manifest_prefix="data/manifest" \ ---target_dir="${MAIN_ROOT}/dataset/aishell" +--target_dir="${TARGET_DIR}/aishell" if [ $? -ne 0 ]; then echo "Prepare Aishell failed. Terminated." @@ -14,7 +17,7 @@ fi # build vocabulary -python3 ${MAIN_ROOT}/tools/build_vocab.py \ +python3 ${MAIN_ROOT}/utils/build_vocab.py \ --count_threshold=0 \ --vocab_path="data/vocab.txt" \ --manifest_paths "data/manifest.train" "data/manifest.dev" @@ -26,7 +29,7 @@ fi # compute mean and stddev for normalizer -python3 ${MAIN_ROOT}/tools/compute_mean_std.py \ +python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train" \ --num_samples=2000 \ --specgram_type="linear" \ diff --git a/models/lm/download_lm_ch.sh b/examples/aishell/local/download_lm_ch.sh similarity index 73% rename from models/lm/download_lm_ch.sh rename to examples/aishell/local/download_lm_ch.sh index 0e4915262..f9e2261fd 100644 --- a/models/lm/download_lm_ch.sh +++ b/examples/aishell/local/download_lm_ch.sh @@ -1,10 +1,13 @@ #! /usr/bin/env bash -. ../../utils/utility.sh +. ${MAIN_ROOT}/utils/utility.sh + +DIR=data/lm +mkdir -p ${DIR} URL='https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm' MD5="29e02312deb2e59b3c8686c7966d4fe3" -TARGET=./zh_giga.no_cna_cmn.prune01244.klm +TARGET=${DIR}/zh_giga.no_cna_cmn.prune01244.klm echo "Download language model ..." diff --git a/models/aishell/download_model.sh b/examples/aishell/local/download_model.sh similarity index 68% rename from models/aishell/download_model.sh rename to examples/aishell/local/download_model.sh index 76ac4d005..2f9f40fb3 100644 --- a/models/aishell/download_model.sh +++ b/examples/aishell/local/download_model.sh @@ -1,10 +1,13 @@ #! /usr/bin/env bash -. ../../utils/utility.sh +. ${MAIN_ROOT}/utils/utility.sh + +DIR=data/pretrain +mkdir -p ${DIR} URL='https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_fluid.tar.gz' MD5=2bf0cc8b6d5da2a2a787b5cc36a496b5 -TARGET=./aishell_model_fluid.tar.gz +TARGET=${DIR}/aishell_model_fluid.tar.gz echo "Download Aishell model ..." @@ -13,7 +16,7 @@ if [ $? -ne 0 ]; then echo "Fail to download Aishell model!" exit 1 fi -tar -zxvf $TARGET +tar -zxvf $TARGET -C ${DIR} exit 0 diff --git a/examples/aishell/local/export.sh b/examples/aishell/local/export.sh new file mode 100644 index 000000000..1b5533916 --- /dev/null +++ b/examples/aishell/local/export.sh @@ -0,0 +1,20 @@ +#! /usr/bin/env bash + +if [ $# != 2 ];then + echo "usage: export ckpt_path jit_model_path" + exit -1 +fi + +python3 -u ${BIN_DIR}/export.py \ +--config conf/deepspeech2.yaml \ +--checkpoint_path ${1} \ +--export_path ${2} + + +if [ $? -ne 0 ]; then + echo "Failed in evaluation!" + exit 1 +fi + + +exit 0 diff --git a/examples/aishell/local/infer.sh b/examples/aishell/local/infer.sh new file mode 100644 index 000000000..4b4c9381b --- /dev/null +++ b/examples/aishell/local/infer.sh @@ -0,0 +1,23 @@ +#! 
/usr/bin/env bash + + +# download language model +bash local/download_lm_ch.sh +if [ $? -ne 0 ]; then + exit 1 +fi + +python3 -u ${BIN_DIR}/infer.py \ +--device 'gpu' \ +--nproc 1 \ +--config conf/deepspeech2.yaml \ +--checkpoint_path ${1} + + +if [ $? -ne 0 ]; then + echo "Failed in inference!" + exit 1 +fi + + +exit 0 diff --git a/examples/aishell/local/infer_golden.sh b/examples/aishell/local/infer_golden.sh new file mode 100644 index 000000000..3fdcd1b5e --- /dev/null +++ b/examples/aishell/local/infer_golden.sh @@ -0,0 +1,31 @@ +#! /usr/bin/env bash + +# download language model +bash local/download_lm_ch.sh +if [ $? -ne 0 ]; then + exit 1 +fi + +# download well-trained model +bash local/download_model.sh +if [ $? -ne 0 ]; then + exit 1 +fi + +# infer +CUDA_VISIBLE_DEVICES=0 \ +python3 -u ${BIN_DIR}/infer.py \ +--device 'gpu' \ +--nproc 1 \ +--config conf/deepspeech2.yaml \ +--checkpoint_path data/pretrain/params.pdparams \ +--opts data.mean_std_filepath data/pretrain/mean_std.npz \ +--opts data.vocab_filepath data/pretrain/vocab.txt + +if [ $? -ne 0 ]; then + echo "Failed in inference!" + exit 1 +fi + + +exit 0 diff --git a/examples/aishell/local/run_infer.sh b/examples/aishell/local/run_infer.sh deleted file mode 100644 index 90be581be..000000000 --- a/examples/aishell/local/run_infer.sh +++ /dev/null @@ -1,44 +0,0 @@ -#! /usr/bin/env bash - - -# download language model -cd ${MAIN_ROOT}/models/lm > /dev/null -bash download_lm_ch.sh -if [ $? -ne 0 ]; then - exit 1 -fi -cd - > /dev/null - - -# infer -CUDA_VISIBLE_DEVICES=0 \ -python3 -u ${MAIN_ROOT}/infer.py \ ---num_samples=10 \ ---beam_size=300 \ ---num_proc_bsearch=8 \ ---num_conv_layers=2 \ ---num_rnn_layers=3 \ ---rnn_layer_size=1024 \ ---alpha=2.6 \ ---beta=5.0 \ ---cutoff_prob=0.99 \ ---cutoff_top_n=40 \ ---use_gru=True \ ---use_gpu=True \ ---share_rnn_weights=False \ ---infer_manifest="data/manifest.test" \ ---mean_std_path="data/mean_std.npz" \ ---vocab_path="data/vocab.txt" \ ---model_path="checkpoints/step_final" \ ---lang_model_path="${MAIN_ROOT}/models/lm/zh_giga.no_cna_cmn.prune01244.klm" \ ---decoding_method="ctc_beam_search" \ ---error_rate_type="cer" \ ---specgram_type="linear" - -if [ $? -ne 0 ]; then - echo "Failed in inference!" - exit 1 -fi - - -exit 0 diff --git a/examples/aishell/local/run_infer_golden.sh b/examples/aishell/local/run_infer_golden.sh deleted file mode 100644 index 296c0d5b4..000000000 --- a/examples/aishell/local/run_infer_golden.sh +++ /dev/null @@ -1,52 +0,0 @@ -#! /usr/bin/env bash - -# download language model -cd ${MAIN_ROOT}/models/lm > /dev/null -bash download_lm_ch.sh -if [ $? -ne 0 ]; then - exit 1 -fi -cd - > /dev/null - - -# download well-trained model -cd ${MAIN_ROOT}/models/aishell > /dev/null -bash download_model.sh -if [ $? 
-ne 0 ]; then - exit 1 -fi -cd - > /dev/null - - -# infer -CUDA_VISIBLE_DEVICES=0 \ -python3 -u ${MAIN_ROOT}/infer.py \ ---num_samples=10 \ ---beam_size=300 \ ---num_proc_bsearch=8 \ ---num_conv_layers=2 \ ---num_rnn_layers=3 \ ---rnn_layer_size=1024 \ ---alpha=2.6 \ ---beta=5.0 \ ---cutoff_prob=0.99 \ ---cutoff_top_n=40 \ ---use_gru=True \ ---use_gpu=False \ ---share_rnn_weights=False \ ---infer_manifest="data/manifest.test" \ ---mean_std_path="${MAIN_ROOT}/models/aishell/mean_std.npz" \ ---vocab_path="${MAIN_ROOT}/models/aishell/vocab.txt" \ ---model_path="${MAIN_ROOT}/models/aishell" \ ---lang_model_path="${MAIN_ROOT}/models/lm/zh_giga.no_cna_cmn.prune01244.klm" \ ---decoding_method="ctc_beam_search" \ ---error_rate_type="cer" \ ---specgram_type="linear" - -if [ $? -ne 0 ]; then - echo "Failed in inference!" - exit 1 -fi - - -exit 0 diff --git a/examples/aishell/local/run_test.sh b/examples/aishell/local/run_test.sh deleted file mode 100644 index d2dbfb4f0..000000000 --- a/examples/aishell/local/run_test.sh +++ /dev/null @@ -1,43 +0,0 @@ -#! /usr/bin/env bash - -# download language model -cd ${MAIN_ROOT}/models/lm > /dev/null -bash download_lm_ch.sh -if [ $? -ne 0 ]; then - exit 1 -fi -cd - > /dev/null - - -# evaluate model -CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ -python3 -u ${MAIN_ROOT}/test.py \ ---batch_size=128 \ ---beam_size=300 \ ---num_proc_bsearch=8 \ ---num_conv_layers=2 \ ---num_rnn_layers=3 \ ---rnn_layer_size=1024 \ ---alpha=2.6 \ ---beta=5.0 \ ---cutoff_prob=0.99 \ ---cutoff_top_n=40 \ ---use_gru=True \ ---use_gpu=True \ ---share_rnn_weights=False \ ---test_manifest="data/manifest.test" \ ---mean_std_path="data/mean_std.npz" \ ---vocab_path="data/vocab.txt" \ ---model_path="checkpoints/step_final" \ ---lang_model_path="${MAIN_ROOT}/models/lm/zh_giga.no_cna_cmn.prune01244.klm" \ ---decoding_method="ctc_beam_search" \ ---error_rate_type="cer" \ ---specgram_type="linear" - -if [ $? -ne 0 ]; then - echo "Failed in evaluation!" - exit 1 -fi - - -exit 0 diff --git a/examples/aishell/local/run_test_golden.sh b/examples/aishell/local/run_test_golden.sh deleted file mode 100644 index 062a1b99b..000000000 --- a/examples/aishell/local/run_test_golden.sh +++ /dev/null @@ -1,52 +0,0 @@ -#! /usr/bin/env bash - -# download language model -cd ${MAIN_ROOT}/models/lm > /dev/null -bash download_lm_ch.sh -if [ $? -ne 0 ]; then - exit 1 -fi -cd - > /dev/null - - -# download well-trained model -cd ${MAIN_ROOT}/models/aishell > /dev/null -bash download_model.sh -if [ $? -ne 0 ]; then - exit 1 -fi -cd - > /dev/null - - -# evaluate model -CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ -python3 -u ${MAIN_ROOT}/test.py \ ---batch_size=128 \ ---beam_size=300 \ ---num_proc_bsearch=8 \ ---num_conv_layers=2 \ ---num_rnn_layers=3 \ ---rnn_layer_size=1024 \ ---alpha=2.6 \ ---beta=5.0 \ ---cutoff_prob=0.99 \ ---cutoff_top_n=40 \ ---use_gru=True \ ---use_gpu=True \ ---share_rnn_weights=False \ ---test_manifest="data/manifest.test" \ ---mean_std_path="${MAIN_ROOT}/models/aishell/mean_std.npz" \ ---vocab_path="${MAIN_ROOT}/models/aishell/vocab.txt" \ ---model_path="${MAIN_ROOT}/models/aishell" \ ---lang_model_path="${MAIN_ROOT}/models/lm/zh_giga.no_cna_cmn.prune01244.klm" \ ---decoding_method="ctc_beam_search" \ ---error_rate_type="cer" \ ---specgram_type="linear" - -if [ $? -ne 0 ]; then - echo "Failed in evaluation!" 
- exit 1 -fi - - -exit 0 diff --git a/examples/aishell/local/run_train.sh b/examples/aishell/local/run_train.sh deleted file mode 100644 index 5bde13721..000000000 --- a/examples/aishell/local/run_train.sh +++ /dev/null @@ -1,40 +0,0 @@ -#! /usr/bin/env bash - -# train model -# if you wish to resume from an exists model, uncomment --init_from_pretrained_model -export FLAGS_sync_nccl_allreduce=0 -CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ -python3 -u ${MAIN_ROOT}/train.py \ ---batch_size=64 \ ---num_epoch=50 \ ---num_conv_layers=2 \ ---num_rnn_layers=3 \ ---rnn_layer_size=1024 \ ---num_iter_print=100 \ ---save_epoch=1 \ ---num_samples=120000 \ ---learning_rate=5e-4 \ ---max_duration=27.0 \ ---min_duration=0.0 \ ---test_off=False \ ---use_sortagrad=True \ ---use_gru=True \ ---use_gpu=True \ ---is_local=True \ ---share_rnn_weights=False \ ---train_manifest="data/manifest.train" \ ---dev_manifest="data/manifest.dev" \ ---mean_std_path="data/mean_std.npz" \ ---vocab_path="data/vocab.txt" \ ---output_model_dir="./checkpoints" \ ---augment_conf_path="${MAIN_ROOT}/conf/augmentation.config" \ ---specgram_type="linear" \ ---shuffle_method="batch_shuffle_clipped" \ - -if [ $? -ne 0 ]; then - echo "Failed in training!" - exit 1 -fi - - -exit 0 diff --git a/examples/aishell/local/server.sh b/examples/aishell/local/server.sh new file mode 100644 index 000000000..379684075 --- /dev/null +++ b/examples/aishell/local/server.sh @@ -0,0 +1,40 @@ +#! /usr/bin/env bash +# TODO: replace the model with a mandarin model + +if [[ $# != 1 ]];then + echo "usage: server.sh checkpoint_path" + exit -1 +fi + +source path.sh + +# download language model +bash local/download_lm_ch.sh +if [ $? -ne 0 ]; then + exit 1 +fi + +# download well-trained model +bash local/download_model.sh +if [ $? -ne 0 ]; then + exit 1 +fi + +# start demo server +CUDA_VISIBLE_DEVICES=0 \ +python3 -u ${BIN_DIR}/deploy/server.py \ +--device 'gpu' \ +--nproc 1 \ +--config conf/deepspeech2.yaml \ +--host_ip="localhost" \ +--host_port=8086 \ +--speech_save_dir="demo_cache" \ +--checkpoint_path ${1} + +if [ $? -ne 0 ]; then + echo "Failed in starting demo server!" + exit 1 +fi + + +exit 0 diff --git a/examples/aishell/local/test.sh b/examples/aishell/local/test.sh new file mode 100644 index 000000000..74015f5d5 --- /dev/null +++ b/examples/aishell/local/test.sh @@ -0,0 +1,21 @@ +#! /usr/bin/env bash + +# download language model +bash local/download_lm_ch.sh +if [ $? -ne 0 ]; then + exit 1 +fi + +python3 -u ${BIN_DIR}/test.py \ +--device 'gpu' \ +--nproc 1 \ +--config conf/deepspeech2.yaml \ +--checkpoint_path ${1} + +if [ $? -ne 0 ]; then + echo "Failed in evaluation!" + exit 1 +fi + + +exit 0 diff --git a/examples/aishell/local/test_golden.sh b/examples/aishell/local/test_golden.sh new file mode 100644 index 000000000..86abd38cb --- /dev/null +++ b/examples/aishell/local/test_golden.sh @@ -0,0 +1,31 @@ +#! /usr/bin/env bash + +# download language model +bash local/download_lm_ch.sh +if [ $? -ne 0 ]; then + exit 1 +fi + +# download well-trained model +bash local/download_model.sh +if [ $? -ne 0 ]; then + exit 1 +fi + +# evaluate model +CUDA_VISIBLE_DEVICES=0 \ +python3 -u ${BIN_DIR}/test.py \ +--device 'gpu' \ +--nproc 1 \ +--config conf/deepspeech2.yaml \ +--checkpoint_path data/pretrain/params.pdparams \ +--opts data.mean_std_filepath data/pretrain/mean_std.npz \ +--opts data.vocab_filepath data/pretrain/vocab.txt + +if [ $? -ne 0 ]; then + echo "Failed in evaluation!" 
+ exit 1 +fi + + +exit 0 diff --git a/examples/aishell/local/train.sh b/examples/aishell/local/train.sh new file mode 100644 index 000000000..3e13a79e3 --- /dev/null +++ b/examples/aishell/local/train.sh @@ -0,0 +1,23 @@ +#! /usr/bin/env bash + +# train model +# if you wish to resume from an exists model, uncomment --init_from_pretrained_model +export FLAGS_sync_nccl_allreduce=0 + +ngpu=$(echo ${CUDA_VISIBLE_DEVICES} | python -c 'import sys; a = sys.stdin.read(); print(len(a.split(",")));') +echo "using $ngpu gpus..." + +python3 -u ${BIN_DIR}/train.py \ +--device 'gpu' \ +--nproc ${ngpu} \ +--config conf/deepspeech2.yaml \ +--output ckpt + + +if [ $? -ne 0 ]; then + echo "Failed in training!" + exit 1 +fi + + +exit 0 diff --git a/examples/aishell/local/tune.sh b/examples/aishell/local/tune.sh new file mode 100644 index 000000000..9ff5e8b99 --- /dev/null +++ b/examples/aishell/local/tune.sh @@ -0,0 +1,28 @@ +#! /usr/bin/env bash + +# grid-search for hyper-parameters in language model +python3 -u ${BIN_DIR}/tune.py \ +--device 'gpu' \ +--nproc 1 \ +--config conf/deepspeech2.yaml \ +--num_batches=10 \ +--batch_size=128 \ +--beam_size=300 \ +--num_proc_bsearch=8 \ +--num_alphas=10 \ +--num_betas=10 \ +--alpha_from=0.0 \ +--alpha_to=5.0 \ +--beta_from=-6 \ +--beta_to=6 \ +--cutoff_prob=1.0 \ +--cutoff_top_n=40 \ +--checkpoint_path ${1} + +if [ $? -ne 0 ]; then + echo "Failed in tuning!" + exit 1 +fi + + +exit 0 diff --git a/examples/aishell/path.sh b/examples/aishell/path.sh index fd1cebba8..debdbba46 100644 --- a/examples/aishell/path.sh +++ b/examples/aishell/path.sh @@ -6,3 +6,8 @@ export LC_ALL=C # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ + +MODEL=deepspeech2 +export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin diff --git a/examples/aishell/run.sh b/examples/aishell/run.sh index 93bf86388..dc762df99 100644 --- a/examples/aishell/run.sh +++ b/examples/aishell/run.sh @@ -1,21 +1,16 @@ #!/bin/bash source path.sh +# only demos # prepare data -bash ./local/run_data.sh - -# test pretrain model -bash ./local/run_test_golden.sh - -# test pretain model -bash ./local/run_infer_golden.sh +bash ./local/data.sh # train model -bash ./local/run_train.sh +CUDA_VISIBLE_DEVICES=0,1,2,3 bash ./local/train.sh # test model -bash ./local/run_test.sh +CUDA_VISIBLE_DEVICES=0 bash ./local/test.sh ckpt/checkpoints/step-3284 # infer model -bash ./local/run_infer.sh +CUDA_VISIBLE_DEVICES=0 bash ./local/infer.sh ckpt/checkpoints/step-3284 diff --git a/examples/aug_conf/augmentation.config b/examples/aug_conf/augmentation.config new file mode 100644 index 000000000..6c24da549 --- /dev/null +++ b/examples/aug_conf/augmentation.config @@ -0,0 +1,8 @@ +[ + { + "type": "shift", + "params": {"min_shift_ms": -5, + "max_shift_ms": 5}, + "prob": 1.0 + } +] diff --git a/conf/augmentation.config.example b/examples/aug_conf/augmentation.config.example similarity index 100% rename from conf/augmentation.config.example rename to examples/aug_conf/augmentation.config.example diff --git a/examples/baidu_en8k/run_infer_golden.sh b/examples/baidu_en8k/run_infer_golden.sh deleted file mode 100644 index 11d7541ee..000000000 --- a/examples/baidu_en8k/run_infer_golden.sh +++ /dev/null @@ -1,54 +0,0 @@ -#! /usr/bin/env bash - -source path.sh - -# download language model -cd ${MAIN_ROOT}/models/lm > /dev/null -bash download_lm_en.sh -if [ $? 
-ne 0 ]; then - exit 1 -fi -cd - > /dev/null - - -# download well-trained model -cd ${MAIN_ROOT}/models/baidu_en8k > /dev/null -bash download_model.sh -if [ $? -ne 0 ]; then - exit 1 -fi -cd - > /dev/null - - -# infer -CUDA_VISIBLE_DEVICES=0 \ -python3 -u ${MAIN_ROOT}/infer.py \ ---num_samples=10 \ ---beam_size=500 \ ---num_proc_bsearch=5 \ ---num_conv_layers=2 \ ---num_rnn_layers=3 \ ---rnn_layer_size=1024 \ ---alpha=1.4 \ ---beta=0.35 \ ---cutoff_prob=1.0 \ ---cutoff_top_n=40 \ ---use_gru=True \ ---use_gpu=False \ ---share_rnn_weights=False \ ---infer_manifest="${MAIN_ROOT}/examples/librispeech/data/manifest.test-clean" \ ---mean_std_path="${MAIN_ROOT}/models/baidu_en8k/mean_std.npz" \ ---vocab_path="${MAIN_ROOT}/models/baidu_en8k/vocab.txt" \ ---model_path="${MAIN_ROOT}/models/baidu_en8k" \ ---lang_model_path="${MAIN_ROOT}/models/lm/common_crawl_00.prune01111.trie.klm" \ ---decoding_method="ctc_beam_search" \ ---error_rate_type="wer" \ ---specgram_type="linear" - -if [ $? -ne 0 ]; then - echo "Failed in inference!" - exit 1 -fi - - -exit 0 diff --git a/examples/baidu_en8k/run_test_golden.sh b/examples/baidu_en8k/run_test_golden.sh deleted file mode 100644 index 10c61a096..000000000 --- a/examples/baidu_en8k/run_test_golden.sh +++ /dev/null @@ -1,54 +0,0 @@ -#! /usr/bin/env bash - -source path.sh - -# download language model -cd ${MAIN_ROOT}/models/lm > /dev/null -bash download_lm_en.sh -if [ $? -ne 0 ]; then - exit 1 -fi -cd - > /dev/null - - -# download well-trained model -cd ${MAIN_ROOT}/models/baidu_en8k > /dev/null -bash download_model.sh -if [ $? -ne 0 ]; then - exit 1 -fi -cd - > /dev/null - - -# evaluate model -CUDA_VISIBLE_DEVICES=0,1,2,3 \ -python3 -u ${MAIN_ROOT}/test.py \ ---batch_size=128 \ ---beam_size=500 \ ---num_proc_bsearch=8 \ ---num_proc_data=8 \ ---num_conv_layers=2 \ ---num_rnn_layers=3 \ ---rnn_layer_size=1024 \ ---alpha=1.4 \ ---beta=0.35 \ ---cutoff_prob=1.0 \ ---cutoff_top_n=40 \ ---use_gru=True \ ---use_gpu=False \ ---share_rnn_weights=False \ ---test_manifest="data/manifest.test-clean" \ ---mean_std_path="${MAIN_ROOT}/models/baidu_en8k/mean_std.npz" \ ---vocab_path="${MAIN_ROOT}/models/baidu_en8k/vocab.txt" \ ---model_path="${MAIN_ROOT}/models/baidu_en8k" \ ---lang_model_path="${MAIN_ROOT}/models/lm/common_crawl_00.prune01111.trie.klm" \ ---decoding_method="ctc_beam_search" \ ---error_rate_type="wer" \ ---specgram_type="linear" - -if [ $? -ne 0 ]; then - echo "Failed in evaluation!" 
- exit 1 -fi - -exit 0 diff --git a/examples/dataset/aishell/.gitignore b/examples/dataset/aishell/.gitignore new file mode 100644 index 000000000..9c6e517e5 --- /dev/null +++ b/examples/dataset/aishell/.gitignore @@ -0,0 +1 @@ +data_aishell* diff --git a/examples/aishell/local/aishell.py b/examples/dataset/aishell/aishell.py similarity index 98% rename from examples/aishell/local/aishell.py rename to examples/dataset/aishell/aishell.py index ba59b744d..38d0c28a3 100644 --- a/examples/aishell/local/aishell.py +++ b/examples/dataset/aishell/aishell.py @@ -24,7 +24,7 @@ import codecs import soundfile import json import argparse -from data_utils.utility import download, unpack +from utils.utility import download, unpack DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') diff --git a/data/noise/chime3_background.py b/examples/dataset/chime3_background/chime3_background.py similarity index 97% rename from data/noise/chime3_background.py rename to examples/dataset/chime3_background/chime3_background.py index 8db09204e..31208d147 100644 --- a/data/noise/chime3_background.py +++ b/examples/dataset/chime3_background/chime3_background.py @@ -29,7 +29,8 @@ import json import io from paddle.v2.dataset.common import md5file -DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') +#DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') +DATA_HOME = os.path.expanduser('.') URL = "https://d4s.myairbridge.com/packagev2/AG0Y3DNBE5IWRRTV/?dlid=W19XG7T0NNHB027139H0EQ" MD5 = "c3ff512618d7a67d4f85566ea1bc39ec" diff --git a/examples/dataset/librispeech/.gitignore b/examples/dataset/librispeech/.gitignore new file mode 100644 index 000000000..a8d8eb76d --- /dev/null +++ b/examples/dataset/librispeech/.gitignore @@ -0,0 +1,7 @@ +dev-clean/ +dev-other/ +test-clean/ +test-other/ +train-clean-100/ +train-clean-360/ +train-other-500/ diff --git a/examples/librispeech/local/librispeech.py b/examples/dataset/librispeech/librispeech.py similarity index 98% rename from examples/librispeech/local/librispeech.py rename to examples/dataset/librispeech/librispeech.py index ae1bae2de..4cf0f5541 100644 --- a/examples/librispeech/local/librispeech.py +++ b/examples/dataset/librispeech/librispeech.py @@ -27,10 +27,10 @@ import soundfile import json import codecs import io -from data_utils.utility import download, unpack +from utils.utility import download, unpack URL_ROOT = "http://www.openslr.org/resources/12" -URL_ROOT = "https://openslr.magicdatatech.com/resources/12" +#URL_ROOT = "https://openslr.magicdatatech.com/resources/12" URL_TEST_CLEAN = URL_ROOT + "/test-clean.tar.gz" URL_TEST_OTHER = URL_ROOT + "/test-other.tar.gz" URL_DEV_CLEAN = URL_ROOT + "/dev-clean.tar.gz" diff --git a/examples/dataset/mini_librispeech/.gitignore b/examples/dataset/mini_librispeech/.gitignore new file mode 100644 index 000000000..61f54c966 --- /dev/null +++ b/examples/dataset/mini_librispeech/.gitignore @@ -0,0 +1,4 @@ +dev-clean/ +manifest.dev-clean +manifest.train-clean +train-clean/ diff --git a/examples/dataset/mini_librispeech/mini_librispeech.py b/examples/dataset/mini_librispeech/mini_librispeech.py new file mode 100644 index 000000000..883a322dc --- /dev/null +++ b/examples/dataset/mini_librispeech/mini_librispeech.py @@ -0,0 +1,115 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
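Editor's note: every dataset script in this patch pairs a download URL with an MD5 digest and passes both to a shared `download(url, md5sum, target_dir)` helper, now imported from `utils.utility`. That helper is not shown in this diff, so the snippet below only sketches the checksum step it implies, hashing the archive in chunks and comparing against the published digest; the local filename is hypothetical.

```python
# Sketch of the md5 check implied by download(url, md5sum, target_dir);
# utils/utility.py holds the real download logic, which is not in this diff.
import hashlib

def md5_of(path, chunk_size=1 << 20):
    digest = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()

expected = 'c3ff512618d7a67d4f85566ea1bc39ec'   # MD5_DATA from chime3_background.py above
archive = 'chime3_background.tar.gz'            # hypothetical local filename
if md5_of(archive) != expected:
    raise RuntimeError('checksum mismatch, re-download the archive')
```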
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Prepare Librispeech ASR datasets. + +Download, unpack and create manifest files. +Manifest file is a json-format file with each line containing the +meta data (i.e. audio filepath, transcript and audio duration) +of each audio file in the data set. +""" + +import distutils.util +import os +import sys +import argparse +import soundfile +import json +import codecs +import io +from utils.utility import download, unpack + +URL_ROOT = "http://www.openslr.org/resources/31" +URL_TRAIN_CLEAN = URL_ROOT + "/train-clean-5.tar.gz" +URL_DEV_CLEAN = URL_ROOT + "/dev-clean-2.tar.gz" + +MD5_TRAIN_CLEAN = "5df7d4e78065366204ca6845bb08f490" +MD5_DEV_CLEAN = "6d7ab67ac6a1d2c993d050e16d61080d" + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--target_dir", + default='~/.cache/paddle/dataset/speech/libri', + type=str, + help="Directory to save the dataset. (default: %(default)s)") +parser.add_argument( + "--manifest_prefix", + default="manifest", + type=str, + help="Filepath prefix for output manifests. (default: %(default)s)") +args = parser.parse_args() + + +def create_manifest(data_dir, manifest_path): + """Create a manifest json file summarizing the data set, with each line + containing the meta data (i.e. audio filepath, transcription text, audio + duration) of each audio file within the data set. + """ + print("Creating manifest %s ..." % manifest_path) + json_lines = [] + for subfolder, _, filelist in sorted(os.walk(data_dir)): + text_filelist = [ + filename for filename in filelist if filename.endswith('trans.txt') + ] + if len(text_filelist) > 0: + text_filepath = os.path.join(subfolder, text_filelist[0]) + for line in io.open(text_filepath, encoding="utf8"): + segments = line.strip().split() + text = ' '.join(segments[1:]).lower() + audio_filepath = os.path.join(subfolder, segments[0] + '.flac') + audio_data, samplerate = soundfile.read(audio_filepath) + duration = float(len(audio_data)) / samplerate + json_lines.append( + json.dumps({ + 'audio_filepath': audio_filepath, + 'duration': duration, + 'text': text + })) + with codecs.open(manifest_path, 'w', 'utf-8') as out_file: + for line in json_lines: + out_file.write(line + '\n') + + +def prepare_dataset(url, md5sum, target_dir, manifest_path): + """Download, unpack and create summmary manifest file. + """ + if not os.path.exists(os.path.join(target_dir, "LibriSpeech")): + # download + filepath = download(url, md5sum, target_dir) + # unpack + unpack(filepath, target_dir) + else: + print("Skip downloading and unpacking. Data already exists in %s." 
% + target_dir) + # create manifest json file + create_manifest(target_dir, manifest_path) + + +def main(): + if args.target_dir.startswith('~'): + args.target_dir = os.path.expanduser(args.target_dir) + + prepare_dataset( + url=URL_TRAIN_CLEAN, + md5sum=MD5_TRAIN_CLEAN, + target_dir=os.path.join(args.target_dir, "train-clean"), + manifest_path=args.manifest_prefix + ".train-clean") + prepare_dataset( + url=URL_DEV_CLEAN, + md5sum=MD5_DEV_CLEAN, + target_dir=os.path.join(args.target_dir, "dev-clean"), + manifest_path=args.manifest_prefix + ".dev-clean") + + +if __name__ == '__main__': + main() diff --git a/examples/dataset/musan/.gitignore b/examples/dataset/musan/.gitignore new file mode 100644 index 000000000..3f0d0616a --- /dev/null +++ b/examples/dataset/musan/.gitignore @@ -0,0 +1,5 @@ +manifest.music +manifest.noise +manifest.speech +musan/ +musan.tar.gz diff --git a/examples/dataset/musan/musan.py b/examples/dataset/musan/musan.py new file mode 100644 index 000000000..87d8e5e10 --- /dev/null +++ b/examples/dataset/musan/musan.py @@ -0,0 +1,105 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Prepare Aishell mandarin dataset + +Download, unpack and create manifest files. +Manifest file is a json-format file with each line containing the +meta data (i.e. audio filepath, transcript and audio duration) +of each audio file in the data set. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import codecs +import soundfile +import json +import argparse +from utils.utility import download, unpack + +DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') + +URL_ROOT = 'https://www.openslr.org/resources/17' +DATA_URL = URL_ROOT + '/musan.tar.gz' +MD5_DATA = '0c472d4fc0c5141eca47ad1ffeb2a7df' + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--target_dir", + default=DATA_HOME + "/musan", + type=str, + help="Directory to save the dataset. (default: %(default)s)") +parser.add_argument( + "--manifest_prefix", + default="manifest", + type=str, + help="Filepath prefix for output manifests. (default: %(default)s)") +args = parser.parse_args() + + +def create_manifest(data_dir, manifest_path_prefix): + print("Creating manifest %s ..." % manifest_path_prefix) + json_lines = [] + data_types = ['music', 'noise', 'speech'] + for type in data_types: + del json_lines[:] + audio_dir = os.path.join(data_dir, type) + for subfolder, _, filelist in sorted(os.walk(audio_dir)): + print('x, ', subfolder) + for fname in filelist: + audio_path = os.path.join(subfolder, fname) + if not audio_path.endswith('.wav'): + continue + audio_data, samplerate = soundfile.read(audio_path) + duration = float(len(audio_data) / samplerate) + json_lines.append( + json.dumps( + { + 'audio_filepath': audio_path, + 'duration': duration, + 'type': type, + }, + ensure_ascii=False)) + manifest_path = manifest_path_prefix + '.' 
+ type + with codecs.open(manifest_path, 'w', 'utf-8') as fout: + for line in json_lines: + fout.write(line + '\n') + + +def prepare_dataset(url, md5sum, target_dir, manifest_path): + """Download, unpack and create manifest file.""" + data_dir = os.path.join(target_dir, 'musan') + if not os.path.exists(data_dir): + filepath = download(url, md5sum, target_dir) + unpack(filepath, target_dir) + else: + print("Skip downloading and unpacking. Data already exists in %s." % + target_dir) + create_manifest(data_dir, manifest_path) + + +def main(): + if args.target_dir.startswith('~'): + args.target_dir = os.path.expanduser(args.target_dir) + + prepare_dataset( + url=DATA_URL, + md5sum=MD5_DATA, + target_dir=args.target_dir, + manifest_path=args.manifest_prefix) + + +if __name__ == '__main__': + main() diff --git a/examples/dataset/rir_noise/.gitignore b/examples/dataset/rir_noise/.gitignore new file mode 100644 index 000000000..eb7588824 --- /dev/null +++ b/examples/dataset/rir_noise/.gitignore @@ -0,0 +1,5 @@ +RIRS_NOISES/ +manifest.pointsource_noises +manifest.real_rirs_isotropic_noises +manifest.simulated_rirs +rirs_noises.zip diff --git a/examples/dataset/rir_noise/rir_noise.py b/examples/dataset/rir_noise/rir_noise.py new file mode 100644 index 000000000..643540c9b --- /dev/null +++ b/examples/dataset/rir_noise/rir_noise.py @@ -0,0 +1,106 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Prepare Aishell mandarin dataset + +Download, unpack and create manifest files. +Manifest file is a json-format file with each line containing the +meta data (i.e. audio filepath, transcript and audio duration) +of each audio file in the data set. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import codecs +import soundfile +import json +import argparse +from utils.utility import download, unpack, unzip + +DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') + +URL_ROOT = 'http://www.openslr.org/resources/28' +DATA_URL = URL_ROOT + '/rirs_noises.zip' +MD5_DATA = 'e6f48e257286e05de56413b4779d8ffb' + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--target_dir", + default=DATA_HOME + "/Aishell", + type=str, + help="Directory to save the dataset. (default: %(default)s)") +parser.add_argument( + "--manifest_prefix", + default="manifest", + type=str, + help="Filepath prefix for output manifests. (default: %(default)s)") +args = parser.parse_args() + + +def create_manifest(data_dir, manifest_path_prefix): + print("Creating manifest %s ..." 
% manifest_path_prefix) + json_lines = [] + data_types = [ + 'pointsource_noises', 'real_rirs_isotropic_noises', 'simulated_rirs' + ] + for type in data_types: + del json_lines[:] + audio_dir = os.path.join(data_dir, type) + for subfolder, _, filelist in sorted(os.walk(audio_dir)): + for fname in filelist: + audio_path = os.path.join(subfolder, fname) + if not audio_path.endswith('.wav'): + continue + audio_data, samplerate = soundfile.read(audio_path) + duration = float(len(audio_data) / samplerate) + json_lines.append( + json.dumps( + { + 'audio_filepath': audio_path, + 'duration': duration, + 'type': type, + }, + ensure_ascii=False)) + manifest_path = manifest_path_prefix + '.' + type + with codecs.open(manifest_path, 'w', 'utf-8') as fout: + for line in json_lines: + fout.write(line + '\n') + + +def prepare_dataset(url, md5sum, target_dir, manifest_path): + """Download, unzip and create manifest file.""" + data_dir = os.path.join(target_dir, 'RIRS_NOISES') + if not os.path.exists(data_dir): + filepath = download(url, md5sum, target_dir) + unzip(filepath, target_dir) + else: + print("Skip downloading and unpacking. Data already exists in %s." % + target_dir) + create_manifest(data_dir, manifest_path) + + +def main(): + if args.target_dir.startswith('~'): + args.target_dir = os.path.expanduser(args.target_dir) + + prepare_dataset( + url=DATA_URL, + md5sum=MD5_DATA, + target_dir=args.target_dir, + manifest_path=args.manifest_prefix) + + +if __name__ == '__main__': + main() diff --git a/data/voxforge/run_data.sh b/examples/dataset/voxforge/run_data.sh similarity index 58% rename from data/voxforge/run_data.sh rename to examples/dataset/voxforge/run_data.sh index 0276744ae..5af9d0cc6 100644 --- a/data/voxforge/run_data.sh +++ b/examples/dataset/voxforge/run_data.sh @@ -1,9 +1,12 @@ #! 
/usr/bin/env bash +TARGET_DIR=${MAIN_ROOT}/examples/dataset/voxforge +mkdir -p ${TARGET_DIR} + # download data, generate manifests -PYTHONPATH=../../:$PYTHONPATH python voxforge.py \ ---manifest_prefix='./manifest' \ ---target_dir='./dataset/VoxForge' \ +python ${MAIN_ROOT}/examples/dataset/voxforge/voxforge.py \ +--manifest_prefix="${TARGET_DIR}/manifest" \ +--target_dir="${TARGET_DIR}" \ --is_merge_dialect=True \ --dialects 'american' 'british' 'australian' 'european' 'irish' 'canadian' 'indian' diff --git a/data/voxforge/voxforge.py b/examples/dataset/voxforge/voxforge.py similarity index 98% rename from data/voxforge/voxforge.py rename to examples/dataset/voxforge/voxforge.py index 3fb0ded88..abf1ccff6 100644 --- a/data/voxforge/voxforge.py +++ b/examples/dataset/voxforge/voxforge.py @@ -27,9 +27,9 @@ import json import argparse import shutil import subprocess -from data_utils.utility import download_multi, unpack, getfile_insensitive +from utils.utility import download_multi, unpack, getfile_insensitive -DATA_HOME = './dataset' +DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') DATA_URL = 'http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/' \ 'Audio/Main/16kHz_16bit' diff --git a/examples/deploy_demo/path.sh b/examples/deploy_demo/path.sh deleted file mode 100644 index fd1cebba8..000000000 --- a/examples/deploy_demo/path.sh +++ /dev/null @@ -1,8 +0,0 @@ -export MAIN_ROOT=${PWD}/../../ - -export PATH=${MAIN_ROOT}:${PWD}/tools:${PATH} -export LC_ALL=C - -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} diff --git a/examples/deploy_demo/run_english_demo_server.sh b/examples/deploy_demo/run_english_demo_server.sh deleted file mode 100644 index ae092dbce..000000000 --- a/examples/deploy_demo/run_english_demo_server.sh +++ /dev/null @@ -1,54 +0,0 @@ -#! /usr/bin/env bash -# TODO: replace the model with a mandarin model - -source path.sh - -# download language model -cd ${MAIN_ROOT}/models/lm > /dev/null -bash download_lm_en.sh -if [ $? -ne 0 ]; then - exit 1 -fi -cd - > /dev/null - - -# download well-trained model -cd ${MAIN_ROOT}/models/baidu_en8k > /dev/null -bash download_model.sh -if [ $? -ne 0 ]; then - exit 1 -fi -cd - > /dev/null - - -# start demo server -CUDA_VISIBLE_DEVICES=0 \ -python3 -u ${MAIN_ROOT}/deploy/demo_server.py \ ---host_ip="localhost" \ ---host_port=8086 \ ---num_conv_layers=2 \ ---num_rnn_layers=3 \ ---rnn_layer_size=1024 \ ---alpha=1.15 \ ---beta=0.15 \ ---cutoff_prob=1.0 \ ---cutoff_top_n=40 \ ---use_gru=True \ ---use_gpu=True \ ---share_rnn_weights=False \ ---speech_save_dir="demo_cache" \ ---warmup_manifest="${MAIN_ROOT}/examples/tiny/data/manifest.test-clean" \ ---mean_std_path="${MAIN_ROOT}/models/baidu_en8k/mean_std.npz" \ ---vocab_path="${MAIN_ROOT}/models/baidu_en8k/vocab.txt" \ ---model_path="${MAIN_ROOT}/models/baidu_en8k" \ ---lang_model_path="${MAIN_ROOT}/models/lm/common_crawl_00.prune01111.trie.klm" \ ---decoding_method="ctc_beam_search" \ ---specgram_type="linear" - -if [ $? -ne 0 ]; then - echo "Failed in starting demo server!" 
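Editor's note: all of the dataset scripts moved or added under `examples/dataset/` (aishell, librispeech, mini_librispeech, musan, rir_noise, voxforge) emit manifests in the same shape: one JSON object per line with `audio_filepath` and `duration`, plus either a `text` transcript or a noise `type`. The training configs later in this patch then filter utterances by `min_duration`/`max_duration`. A small illustrative reader (not code from the repository) makes the format concrete:

```python
# Illustrative reader for the line-delimited JSON manifests written above.
import json

def read_manifest(path, min_duration=0.0, max_duration=float('inf')):
    entries = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            entry = json.loads(line)
            if min_duration <= entry['duration'] <= max_duration:
                entries.append(entry)
    return entries

# e.g. the 27 s cap used by the deepspeech2.yaml configs below
train = read_manifest('data/manifest.train', max_duration=27.0)
hours = sum(e['duration'] for e in train) / 3600.0
print(f"{len(train)} utterances, {hours:.1f} hours of speech")
```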
- exit 1 -fi - - -exit 0 diff --git a/examples/librispeech/.gitignore b/examples/librispeech/.gitignore new file mode 100644 index 000000000..44038ca5b --- /dev/null +++ b/examples/librispeech/.gitignore @@ -0,0 +1,2 @@ +data +ckpt* diff --git a/examples/librispeech/conf/augmentation.config b/examples/librispeech/conf/augmentation.config new file mode 100644 index 000000000..6c24da549 --- /dev/null +++ b/examples/librispeech/conf/augmentation.config @@ -0,0 +1,8 @@ +[ + { + "type": "shift", + "params": {"min_shift_ms": -5, + "max_shift_ms": 5}, + "prob": 1.0 + } +] diff --git a/examples/librispeech/conf/deepspeech2.yaml b/examples/librispeech/conf/deepspeech2.yaml new file mode 100644 index 000000000..15fd4cbe3 --- /dev/null +++ b/examples/librispeech/conf/deepspeech2.yaml @@ -0,0 +1,47 @@ +# https://yaml.org/type/float.html +data: + train_manifest: data/manifest.train + dev_manifest: data/manifest.dev-clean + test_manifest: data/manifest.test-clean + mean_std_filepath: data/mean_std.npz + vocab_filepath: data/vocab.txt + augmentation_config: conf/augmentation.config + batch_size: 20 + max_duration: 27.0 + min_duration: 0.0 + specgram_type: linear + target_sample_rate: 16000 + max_freq: None + n_fft: None + stride_ms: 10.0 + window_ms: 20.0 + use_dB_normalization: True + target_dB: -20 + random_seed: 0 + keep_transcription_text: False + sortagrad: True + shuffle_method: batch_shuffle + num_workers: 0 +model: + num_conv_layers: 2 + num_rnn_layers: 3 + rnn_layer_size: 2048 + use_gru: False + share_rnn_weights: True +training: + n_epoch: 20 + lr: 5e-4 + lr_decay: 0.83 + weight_decay: 1e-06 + global_grad_clip: 5.0 +decoding: + batch_size: 128 + error_rate_type: wer + decoding_method: ctc_beam_search + lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm + alpha: 2.5 + beta: 0.3 + beam_size: 500 + cutoff_prob: 1.0 + cutoff_top_n: 40 + num_proc_bsearch: 8 diff --git a/examples/librispeech/local/run_data.sh b/examples/librispeech/local/data.sh similarity index 73% rename from examples/librispeech/local/run_data.sh rename to examples/librispeech/local/data.sh index cbcad7b8d..ca65d640c 100644 --- a/examples/librispeech/local/run_data.sh +++ b/examples/librispeech/local/data.sh @@ -1,11 +1,13 @@ #! /usr/bin/env bash mkdir -p data +TARGET_DIR=${MAIN_ROOT}/examples/dataset +mkdir -p ${TARGET_DIR} # download data, generate manifests -PYTHONPATH=.:$PYTHONPATH python3 local/librispeech.py \ +PYTHONPATH=.:$PYTHONPATH python3 ${TARGET_DIR}/librispeech/librispeech.py \ --manifest_prefix="data/manifest" \ ---target_dir="${MAIN_ROOT}/dataset/librispeech" \ +--target_dir="${TARGET_DIR}/librispeech" \ --full_download="True" if [ $? -ne 0 ]; then @@ -15,9 +17,8 @@ fi cat data/manifest.train-* | shuf > data/manifest.train - # build vocabulary -python3 ${MAIN_ROOT}/tools/build_vocab.py \ +python3 ${MAIN_ROOT}/utils/build_vocab.py \ --count_threshold=0 \ --vocab_path="data/vocab.txt" \ --manifest_paths="data/manifest.train" @@ -27,9 +28,8 @@ if [ $? -ne 0 ]; then exit 1 fi - # compute mean and stddev for normalizer -python3 ${MAIN_ROOT}/tools/compute_mean_std.py \ +python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train" \ --num_samples=2000 \ --specgram_type="linear" \ @@ -40,6 +40,5 @@ if [ $? -ne 0 ]; then exit 1 fi - echo "LibriSpeech Data preparation done." 
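Editor's note: the `# https://yaml.org/type/float.html` comment at the top of `conf/deepspeech2.yaml` flags a YAML 1.1 typing pitfall: with PyYAML's default resolver, exponent literals that lack a decimal point (`5e-4`, `1e-06`) load as strings, and a bare `None` becomes the string `'None'` rather than a null, so whatever consumes the config presumably has to coerce those fields. Assuming PyYAML (the project may parse its configs differently), the behavior is easy to verify:

```python
# Shows the YAML 1.1 typing quirk that the config's first comment points at.
import yaml

snippet = """
lr: 5e-4          # no decimal point, so YAML 1.1 treats it as a string
lr_decay: 0.83    # a proper float
weight_decay: 1e-06
max_freq: None    # 'None' is not a YAML null
"""
cfg = yaml.safe_load(snippet)
print({key: (value, type(value).__name__) for key, value in cfg.items()})
# {'lr': ('5e-4', 'str'), 'lr_decay': (0.83, 'float'),
#  'weight_decay': ('1e-06', 'str'), 'max_freq': ('None', 'str')}
```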
exit 0 diff --git a/models/lm/download_lm_en.sh b/examples/librispeech/local/download_lm_en.sh similarity index 73% rename from models/lm/download_lm_en.sh rename to examples/librispeech/local/download_lm_en.sh index cc8d32035..05ea793fb 100644 --- a/models/lm/download_lm_en.sh +++ b/examples/librispeech/local/download_lm_en.sh @@ -1,11 +1,13 @@ #! /usr/bin/env bash -. ../../utils/utility.sh +. ${MAIN_ROOT}/utils/utility.sh + +DIR=data/lm +mkdir -p ${DIR} URL=https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm MD5="099a601759d467cd0a8523ff939819c5" -TARGET=./common_crawl_00.prune01111.trie.klm - +TARGET=${DIR}/common_crawl_00.prune01111.trie.klm echo "Download language model ..." download $URL $MD5 $TARGET diff --git a/models/librispeech/download_model.sh b/examples/librispeech/local/download_model.sh similarity index 68% rename from models/librispeech/download_model.sh rename to examples/librispeech/local/download_model.sh index edf853054..f13bde0f2 100644 --- a/models/librispeech/download_model.sh +++ b/examples/librispeech/local/download_model.sh @@ -1,10 +1,13 @@ #! /usr/bin/env bash -. ../../utils/utility.sh +. ${MAIN_ROOT}/utils/utility.sh + +DIR=data/pretrain +mkdir -p ${DIR} URL='https://deepspeech.bj.bcebos.com/eng_models/librispeech_model_fluid.tar.gz' MD5=fafb11fe57c3ecd107147056453f5348 -TARGET=./librispeech_model_fluid.tar.gz +TARGET=${DIR}/librispeech_model_fluid.tar.gz echo "Download LibriSpeech model ..." @@ -13,7 +16,6 @@ if [ $? -ne 0 ]; then echo "Fail to download LibriSpeech model!" exit 1 fi -tar -zxvf $TARGET - +tar -zxvf $TARGET -C ${DIR} exit 0 diff --git a/examples/librispeech/local/infer.sh b/examples/librispeech/local/infer.sh new file mode 100644 index 000000000..9ea39901f --- /dev/null +++ b/examples/librispeech/local/infer.sh @@ -0,0 +1,20 @@ +#! /usr/bin/env bash + +# download language model +bash local/download_lm_en.sh +if [ $? -ne 0 ]; then + exit 1 +fi + +python3 -u ${BIN_DIR}/infer.py \ +--device 'gpu' \ +--nproc 1 \ +--config conf/deepspeech2.yaml \ +--checkpoint_path ${1} + +if [ $? -ne 0 ]; then + echo "Failed in inference!" + exit 1 +fi + +exit 0 diff --git a/examples/librispeech/local/run_infer.sh b/examples/librispeech/local/run_infer.sh deleted file mode 100644 index 33959b381..000000000 --- a/examples/librispeech/local/run_infer.sh +++ /dev/null @@ -1,43 +0,0 @@ -#! /usr/bin/env bash - -# download language model -cd ${MAIN_ROOT}/models/lm > /dev/null -bash download_lm_en.sh -if [ $? -ne 0 ]; then - exit 1 -fi -cd - > /dev/null - - -# infer -CUDA_VISIBLE_DEVICES=0 \ -python3 -u ${MAIN_ROOT}/infer.py \ ---num_samples=10 \ ---beam_size=500 \ ---num_proc_bsearch=8 \ ---num_conv_layers=2 \ ---num_rnn_layers=3 \ ---rnn_layer_size=2048 \ ---alpha=2.5 \ ---beta=0.3 \ ---cutoff_prob=1.0 \ ---cutoff_top_n=40 \ ---use_gru=False \ ---use_gpu=True \ ---share_rnn_weights=True \ ---infer_manifest="data/manifest.test-clean" \ ---mean_std_path="data/mean_std.npz" \ ---vocab_path="data/vocab.txt" \ ---model_path="checkpoints/step_final" \ ---lang_model_path="${MAIN_ROOT}/models/lm/common_crawl_00.prune01111.trie.klm" \ ---decoding_method="ctc_beam_search" \ ---error_rate_type="wer" \ ---specgram_type="linear" - -if [ $? -ne 0 ]; then - echo "Failed in inference!" - exit 1 -fi - - -exit 0 diff --git a/examples/librispeech/local/run_infer_golden.sh b/examples/librispeech/local/run_infer_golden.sh deleted file mode 100644 index 21663681b..000000000 --- a/examples/librispeech/local/run_infer_golden.sh +++ /dev/null @@ -1,52 +0,0 @@ -#! 
/usr/bin/env bash - -# download language model -cd ${MAIN_ROOT}/models/lm > /dev/null -bash download_lm_en.sh -if [ $? -ne 0 ]; then - exit 1 -fi -cd - > /dev/null - - -# download well-trained model -cd ${MAIN_ROOT}/models/librispeech > /dev/null -bash download_model.sh -if [ $? -ne 0 ]; then - exit 1 -fi -cd - > /dev/null - - -# infer -CUDA_VISIBLE_DEVICES=0 \ -python3 -u ${MAIN_ROOT}/infer.py \ ---num_samples=10 \ ---beam_size=500 \ ---num_proc_bsearch=8 \ ---num_conv_layers=2 \ ---num_rnn_layers=3 \ ---rnn_layer_size=2048 \ ---alpha=2.5 \ ---beta=0.3 \ ---cutoff_prob=1.0 \ ---cutoff_top_n=40 \ ---use_gru=False \ ---use_gpu=True \ ---share_rnn_weights=True \ ---infer_manifest="data/manifest.test-clean" \ ---mean_std_path="${MAIN_ROOT}/models/librispeech/mean_std.npz" \ ---vocab_path="${MAIN_ROOT}/models/librispeech/vocab.txt" \ ---model_path="${MAIN_ROOT}/models/librispeech" \ ---lang_model_path="${MAIN_ROOT}/models/lm/common_crawl_00.prune01111.trie.klm" \ ---decoding_method="ctc_beam_search" \ ---error_rate_type="wer" \ ---specgram_type="linear" - -if [ $? -ne 0 ]; then - echo "Failed in inference!" - exit 1 -fi - - -exit 0 diff --git a/examples/librispeech/local/run_test.sh b/examples/librispeech/local/run_test.sh deleted file mode 100644 index cd8c07542..000000000 --- a/examples/librispeech/local/run_test.sh +++ /dev/null @@ -1,43 +0,0 @@ -#! /usr/bin/env bash - -# download language model -cd ${MAIN_ROOT}/models/lm > /dev/null -bash download_lm_en.sh -if [ $? -ne 0 ]; then - exit 1 -fi -cd - > /dev/null - - -# evaluate model -CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ -python3 -u ${MAIN_ROOT}/test.py \ ---batch_size=128 \ ---beam_size=500 \ ---num_proc_bsearch=8 \ ---num_conv_layers=2 \ ---num_rnn_layers=3 \ ---rnn_layer_size=2048 \ ---alpha=2.5 \ ---beta=0.3 \ ---cutoff_prob=1.0 \ ---cutoff_top_n=40 \ ---use_gru=False \ ---use_gpu=True \ ---share_rnn_weights=True \ ---test_manifest="data/manifest.test-clean" \ ---mean_std_path="data/mean_std.npz" \ ---vocab_path="data/vocab.txt" \ ---model_path="checkpoints/step_final" \ ---lang_model_path="${MAIN_ROOT}/models/lm/common_crawl_00.prune01111.trie.klm" \ ---decoding_method="ctc_beam_search" \ ---error_rate_type="wer" \ ---specgram_type="linear" - -if [ $? -ne 0 ]; then - echo "Failed in evaluation!" - exit 1 -fi - - -exit 0 diff --git a/examples/librispeech/local/run_test_golden.sh b/examples/librispeech/local/run_test_golden.sh deleted file mode 100644 index 54ec6ad03..000000000 --- a/examples/librispeech/local/run_test_golden.sh +++ /dev/null @@ -1,52 +0,0 @@ -#! /usr/bin/env bash - -# download language model -cd ${MAIN_ROOT}/models/lm > /dev/null -bash download_lm_en.sh -if [ $? -ne 0 ]; then - exit 1 -fi -cd - > /dev/null - - -# download well-trained model -cd ${MAIN_ROOT}/models/librispeech > /dev/null -bash download_model.sh -if [ $? 
-ne 0 ]; then - exit 1 -fi -cd - > /dev/null - - -# evaluate model -CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ -python3 -u ${MAIN_ROOT}/test.py \ ---batch_size=128 \ ---beam_size=500 \ ---num_proc_bsearch=8 \ ---num_conv_layers=2 \ ---num_rnn_layers=3 \ ---rnn_layer_size=2048 \ ---alpha=2.5 \ ---beta=0.3 \ ---cutoff_prob=1.0 \ ---cutoff_top_n=40 \ ---use_gru=False \ ---use_gpu=True \ ---share_rnn_weights=True \ ---test_manifest="data/manifest.test-clean" \ ---mean_std_path="${MAIN_ROOT}/models/librispeech/mean_std.npz" \ ---vocab_path="${MAIN_ROOT}/models/librispeech/vocab.txt" \ ---model_path="${MAIN_ROOT}/models/librispeech" \ ---lang_model_path="${MAIN_ROOT}/models/lm/common_crawl_00.prune01111.trie.klm" \ ---decoding_method="ctc_beam_search" \ ---error_rate_type="wer" \ ---specgram_type="linear" - -if [ $? -ne 0 ]; then - echo "Failed in evaluation!" - exit 1 -fi - - -exit 0 diff --git a/examples/librispeech/local/run_train.sh b/examples/librispeech/local/run_train.sh deleted file mode 100644 index 32aa2657b..000000000 --- a/examples/librispeech/local/run_train.sh +++ /dev/null @@ -1,41 +0,0 @@ -#! /usr/bin/env bash - -# train model -# if you wish to resume from an exists model, uncomment --init_from_pretrained_model -export FLAGS_sync_nccl_allreduce=0 - -CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ -python3 -u ${MAIN_ROOT}/train.py \ ---batch_size=20 \ ---num_epoch=50 \ ---num_conv_layers=2 \ ---num_rnn_layers=3 \ ---rnn_layer_size=2048 \ ---num_iter_print=100 \ ---save_epoch=1 \ ---num_samples=280000 \ ---learning_rate=5e-4 \ ---max_duration=27.0 \ ---min_duration=0.0 \ ---test_off=False \ ---use_sortagrad=True \ ---use_gru=False \ ---use_gpu=True \ ---is_local=True \ ---share_rnn_weights=True \ ---train_manifest="data/manifest.train" \ ---dev_manifest="data/manifest.dev-clean" \ ---mean_std_path="data/mean_std.npz" \ ---vocab_path="data/vocab.txt" \ ---output_model_dir="./checkpoints/libri" \ ---augment_conf_path="${MAIN_ROOT}/conf/augmentation.config" \ ---specgram_type="linear" \ ---shuffle_method="batch_shuffle_clipped" \ - -if [ $? -ne 0 ]; then - echo "Failed in training!" - exit 1 -fi - - -exit 0 diff --git a/examples/librispeech/local/run_tune.sh b/examples/librispeech/local/run_tune.sh deleted file mode 100644 index 848f0b8f9..000000000 --- a/examples/librispeech/local/run_tune.sh +++ /dev/null @@ -1,38 +0,0 @@ -#! /usr/bin/env bash - -# grid-search for hyper-parameters in language model -CUDA_VISIBLE_DEVICES=0,1,2,3 \ -python3 -u ${MAIN_ROOT}tools/tune.py \ ---num_batches=-1 \ ---batch_size=128 \ ---beam_size=500 \ ---num_proc_bsearch=12 \ ---num_conv_layers=2 \ ---num_rnn_layers=3 \ ---rnn_layer_size=2048 \ ---num_alphas=45 \ ---num_betas=8 \ ---alpha_from=1.0 \ ---alpha_to=3.2 \ ---beta_from=0.1 \ ---beta_to=0.45 \ ---cutoff_prob=1.0 \ ---cutoff_top_n=40 \ ---use_gru=False \ ---use_gpu=True \ ---share_rnn_weights=True \ ---tune_manifest="data/manifest.dev-clean" \ ---mean_std_path="data/mean_std.npz" \ ---vocab_path="${MAIN_ROOT}/models/librispeech/vocab.txt" \ ---model_path="${MAIN_ROOT}/models/librispeech" \ ---lang_model_path="${MAIN_ROOT}/models/lm/common_crawl_00.prune01111.trie.klm" \ ---error_rate_type="wer" \ ---specgram_type="linear" - -if [ $? -ne 0 ]; then - echo "Failed in tuning!" - exit 1 -fi - - -exit 0 diff --git a/examples/librispeech/local/test.sh b/examples/librispeech/local/test.sh new file mode 100644 index 000000000..f39fbaef1 --- /dev/null +++ b/examples/librispeech/local/test.sh @@ -0,0 +1,22 @@ +#! 
/usr/bin/env bash + +# download language model +bash local/download_lm_en.sh +if [ $? -ne 0 ]; then + exit 1 +fi + +python3 -u ${BIN_DIR}/test.py \ +--device 'gpu' \ +--nproc 1 \ +--config conf/deepspeech2.yaml \ +--output ckpt + + +if [ $? -ne 0 ]; then + echo "Failed in evaluation!" + exit 1 +fi + + +exit 0 diff --git a/examples/librispeech/local/train.sh b/examples/librispeech/local/train.sh new file mode 100644 index 000000000..507947e9e --- /dev/null +++ b/examples/librispeech/local/train.sh @@ -0,0 +1,22 @@ +#! /usr/bin/env bash + +export FLAGS_sync_nccl_allreduce=0 +# https://github.com/PaddlePaddle/Paddle/pull/28484 +export NCCL_SHM_DISABLE=1 + +ngpu=$(echo ${CUDA_VISIBLE_DEVICES} | python -c 'import sys; a = sys.stdin.read(); print(len(a.split(",")));') +echo "using $ngpu gpus..." + +python3 -u ${BIN_DIR}/train.py \ +--device 'gpu' \ +--nproc ${ngpu} \ +--config conf/deepspeech2.yaml \ +--output ckpt + +if [ $? -ne 0 ]; then + echo "Failed in training!" + exit 1 +fi + + +exit 0 diff --git a/examples/librispeech/local/tune.sh b/examples/librispeech/local/tune.sh new file mode 100644 index 000000000..4bb81d29b --- /dev/null +++ b/examples/librispeech/local/tune.sh @@ -0,0 +1,33 @@ +#! /usr/bin/env bash + +if [ $# != 1 ];then + echo "usage: tune ckpt_path" + exit 1 +fi + +# grid-search for hyper-parameters in language model +python3 -u ${BIN_DIR}/tune.py \ +--device 'gpu' \ +--nproc 1 \ +--config conf/deepspeech2.yaml \ +--num_batches=-1 \ +--batch_size=128 \ +--beam_size=500 \ +--num_proc_bsearch=12 \ +--num_alphas=45 \ +--num_betas=8 \ +--alpha_from=1.0 \ +--alpha_to=3.2 \ +--beta_from=0.1 \ +--beta_to=0.45 \ +--cutoff_prob=1.0 \ +--cutoff_top_n=40 \ +--checkpoint_path ${1} + +if [ $? -ne 0 ]; then + echo "Failed in tuning!" + exit 1 +fi + + +exit 0 diff --git a/examples/librispeech/path.sh b/examples/librispeech/path.sh index fd1cebba8..a179631b3 100644 --- a/examples/librispeech/path.sh +++ b/examples/librispeech/path.sh @@ -6,3 +6,9 @@ export LC_ALL=C # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ + + +MODEL=deepspeech2 +export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin diff --git a/examples/librispeech/run.sh b/examples/librispeech/run.sh index c8e589139..ff87d38bf 100644 --- a/examples/librispeech/run.sh +++ b/examples/librispeech/run.sh @@ -1,24 +1,16 @@ #!/bin/bash +set -e source path.sh # prepare data -bash ./local/run_data.sh - -# test pretrain model -bash ./local/run_test_golden.sh - -# test pretain model -bash ./local/run_infer_golden.sh +bash ./local/data.sh # train model -bash ./local/run_train.sh +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ./local/train.sh # test model -bash ./local/run_test.sh +CUDA_VISIBLE_DEVICES=0 bash ./local/test.sh # infer model -bash ./local/run_infer.sh - -# tune model -bash ./local/run_tune.sh +CUDA_VISIBLE_DEVICES=0 bash ./local/infer.sh diff --git a/examples/tiny/.gitignore b/examples/tiny/.gitignore new file mode 100644 index 000000000..073c3b9eb --- /dev/null +++ b/examples/tiny/.gitignore @@ -0,0 +1,2 @@ +ckpt* +data diff --git a/examples/tiny/README.md b/examples/tiny/README.md index d7361b263..c3bfdc9c4 100644 --- a/examples/tiny/README.md +++ b/examples/tiny/README.md @@ -7,36 +7,39 @@ - Prepare the data ```bash - sh local/run_data.sh + bash local/data.sh ``` - `run_data.sh` will download dataset, generate manifests, collect normalizer's statistics and build vocabulary. 
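Editor's note: the new `train.sh` scripts size the launch with a one-liner that splits `CUDA_VISIBLE_DEVICES` on commas and counts the pieces. Splitting an empty string still yields one element, so the count falls back to 1 when the variable is unset, which is worth knowing when the script runs on a CPU-only box. The same logic in plain Python:

```python
# What the ngpu one-liner in train.sh computes (the shell version also reads a
# trailing newline from echo, which does not change the count).
import os

def ngpu_from(value):
    return len(value.split(","))

print(ngpu_from("0,1,2,3"))    # 4
print(ngpu_from(""))           # 1: an empty string still splits into one piece
print(ngpu_from(os.environ.get("CUDA_VISIBLE_DEVICES", "")))
```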
Once the data preparation is done, you will find the data (only part of LibriSpeech) downloaded in `${MAIN_ROOT}/dataset/librispeech` and the corresponding manifest files generated in `${PWD}/data` as well as a mean stddev file and a vocabulary file. It has to be run for the very first time you run this dataset and is reusable for all further experiments. + `data.sh` will download dataset, generate manifests, collect normalizer's statistics and build vocabulary. Once the data preparation is done, you will find the data (only part of LibriSpeech) downloaded in `${MAIN_ROOT}/dataset/librispeech` and the corresponding manifest files generated in `${PWD}/data` as well as a mean stddev file and a vocabulary file. It has to be run for the very first time you run this dataset and is reusable for all further experiments. + - Train your own ASR model ```bash - sh local/run_train.sh + bash local/train.sh ``` - `run_train.sh` will start a training job, with training logs printed to stdout and model checkpoint of every pass/epoch saved to `${PWD}/checkpoints`. These checkpoints could be used for training resuming, inference, evaluation and deployment. + `train.sh` will start a training job, with training logs printed to stdout and model checkpoint of every pass/epoch saved to `${PWD}/checkpoints`. These checkpoints could be used for training resuming, inference, evaluation and deployment. + - Case inference with an existing model ```bash - sh local/run_infer.sh + bash local/infer.sh ``` - `run_infer.sh` will show us some speech-to-text decoding results for several (default: 10) samples with the trained model. The performance might not be good now as the current model is only trained with a toy subset of LibriSpeech. To see the results with a better model, you can download a well-trained (trained for several days, with the complete LibriSpeech) model and do the inference: + `infer.sh` will show us some speech-to-text decoding results for several (default: 10) samples with the trained model. The performance might not be good now as the current model is only trained with a toy subset of LibriSpeech. To see the results with a better model, you can download a well-trained (trained for several days, with the complete LibriSpeech) model and do the inference: ```bash - sh local/run_infer_golden.sh + bash local/infer_golden.sh ``` + - Evaluate an existing model ```bash - sh local/run_test.sh + bash local/test.sh ``` - `run_test.sh` will evaluate the model with Word Error Rate (or Character Error Rate) measurement. Similarly, you can also download a well-trained model and test its performance: + `test.sh` will evaluate the model with Word Error Rate (or Character Error Rate) measurement. 
Similarly, you can also download a well-trained model and test its performance: ```bash - sh local/run_test_golden.sh - ``` \ No newline at end of file + bash local/test_golden.sh + ``` diff --git a/examples/tiny/conf/augmentation.config b/examples/tiny/conf/augmentation.config new file mode 100644 index 000000000..6c24da549 --- /dev/null +++ b/examples/tiny/conf/augmentation.config @@ -0,0 +1,8 @@ +[ + { + "type": "shift", + "params": {"min_shift_ms": -5, + "max_shift_ms": 5}, + "prob": 1.0 + } +] diff --git a/examples/tiny/conf/deepspeech2.yaml b/examples/tiny/conf/deepspeech2.yaml new file mode 100644 index 000000000..c7dd83f3c --- /dev/null +++ b/examples/tiny/conf/deepspeech2.yaml @@ -0,0 +1,47 @@ +# https://yaml.org/type/float.html +data: + train_manifest: data/manifest.tiny + dev_manifest: data/manifest.tiny + test_manifest: data/manifest.tiny + mean_std_filepath: data/mean_std.npz + vocab_filepath: data/vocab.txt + augmentation_config: conf/augmentation.config + batch_size: 4 + max_duration: 27.0 + min_duration: 0.0 + specgram_type: linear + target_sample_rate: 16000 + max_freq: None + n_fft: None + stride_ms: 10.0 + window_ms: 20.0 + use_dB_normalization: True + target_dB: -20 + random_seed: 0 + keep_transcription_text: False + sortagrad: True + shuffle_method: batch_shuffle + num_workers: 0 +model: + num_conv_layers: 2 + num_rnn_layers: 3 + rnn_layer_size: 2048 + use_gru: False + share_rnn_weights: True +training: + n_epoch: 20 + lr: 1e-5 + lr_decay: 1.0 + weight_decay: 1e-06 + global_grad_clip: 5.0 +decoding: + batch_size: 128 + error_rate_type: wer + decoding_method: ctc_beam_search + lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm + alpha: 2.5 + beta: 0.3 + beam_size: 500 + cutoff_prob: 1.0 + cutoff_top_n: 40 + num_proc_bsearch: 8 diff --git a/examples/tiny/local/run_data.sh b/examples/tiny/local/data.sh similarity index 72% rename from examples/tiny/local/run_data.sh rename to examples/tiny/local/data.sh index 3ad387dbc..d834ec677 100644 --- a/examples/tiny/local/run_data.sh +++ b/examples/tiny/local/data.sh @@ -1,14 +1,13 @@ #! /usr/bin/env bash -# prepare folder -if [ ! -e data ]; then - mkdir data -fi +mkdir -p data +TARGET_DIR=${MAIN_ROOT}/examples/dataset +mkdir -p ${TARGET_DIR} # download data, generate manifests -PYTHONPATH=.:$PYTHONPATH python3 ../librispeech/local/librispeech.py \ +PYTHONPATH=.:$PYTHONPATH python3 ${TARGET_DIR}/librispeech/librispeech.py \ --manifest_prefix="data/manifest" \ ---target_dir="${MAIN_ROOT}/dataset/librispeech" \ +--target_dir="${TARGET_DIR}/librispeech" \ --full_download="False" if [ $? -ne 0 ]; then @@ -19,7 +18,7 @@ fi head -n 64 data/manifest.dev-clean > data/manifest.tiny # build vocabulary -python3 ${MAIN_ROOT}/tools/build_vocab.py \ +python3 ${MAIN_ROOT}/utils/build_vocab.py \ --count_threshold=0 \ --vocab_path="data/vocab.txt" \ --manifest_paths="data/manifest.tiny" @@ -31,7 +30,7 @@ fi # compute mean and stddev for normalizer -python3 ${MAIN_ROOT}/tools/compute_mean_std.py \ +python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.tiny" \ --num_samples=64 \ --specgram_type="linear" \ @@ -42,6 +41,5 @@ if [ $? -ne 0 ]; then exit 1 fi - echo "LibriSpeech Data preparation done." exit 0 diff --git a/examples/tiny/local/download_lm_en.sh b/examples/tiny/local/download_lm_en.sh new file mode 100644 index 000000000..05ea793fb --- /dev/null +++ b/examples/tiny/local/download_lm_en.sh @@ -0,0 +1,20 @@ +#! /usr/bin/env bash + +. 
${MAIN_ROOT}/utils/utility.sh + +DIR=data/lm +mkdir -p ${DIR} + +URL=https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm +MD5="099a601759d467cd0a8523ff939819c5" +TARGET=${DIR}/common_crawl_00.prune01111.trie.klm + +echo "Download language model ..." +download $URL $MD5 $TARGET +if [ $? -ne 0 ]; then + echo "Fail to download the language model!" + exit 1 +fi + + +exit 0 diff --git a/examples/tiny/local/download_model.sh b/examples/tiny/local/download_model.sh new file mode 100644 index 000000000..f13bde0f2 --- /dev/null +++ b/examples/tiny/local/download_model.sh @@ -0,0 +1,21 @@ +#! /usr/bin/env bash + +. ${MAIN_ROOT}/utils/utility.sh + +DIR=data/pretrain +mkdir -p ${DIR} + +URL='https://deepspeech.bj.bcebos.com/eng_models/librispeech_model_fluid.tar.gz' +MD5=fafb11fe57c3ecd107147056453f5348 +TARGET=${DIR}/librispeech_model_fluid.tar.gz + + +echo "Download LibriSpeech model ..." +download $URL $MD5 $TARGET +if [ $? -ne 0 ]; then + echo "Fail to download LibriSpeech model!" + exit 1 +fi +tar -zxvf $TARGET -C ${DIR} + +exit 0 diff --git a/examples/tiny/local/export.sh b/examples/tiny/local/export.sh new file mode 100644 index 000000000..1b5533916 --- /dev/null +++ b/examples/tiny/local/export.sh @@ -0,0 +1,20 @@ +#! /usr/bin/env bash + +if [ $# != 2 ];then + echo "usage: export ckpt_path jit_model_path" + exit -1 +fi + +python3 -u ${BIN_DIR}/export.py \ +--config conf/deepspeech2.yaml \ +--checkpoint_path ${1} \ +--export_path ${2} + + +if [ $? -ne 0 ]; then + echo "Failed in evaluation!" + exit 1 +fi + + +exit 0 diff --git a/examples/tiny/local/infer.sh b/examples/tiny/local/infer.sh new file mode 100644 index 000000000..3aff6b78b --- /dev/null +++ b/examples/tiny/local/infer.sh @@ -0,0 +1,22 @@ +#! /usr/bin/env bash + +# download language model +bash local/download_lm_en.sh +if [ $? -ne 0 ]; then + exit 1 +fi + +CUDA_VISIBLE_DEVICES=0 \ +python3 -u ${BIN_DIR}/infer.py \ +--device 'gpu' \ +--nproc 1 \ +--config conf/deepspeech2.yaml \ +--output ckpt + + +if [ $? -ne 0 ]; then + echo "Failed in inference!" + exit 1 +fi + +exit 0 diff --git a/examples/tiny/local/run_infer.sh b/examples/tiny/local/run_infer.sh deleted file mode 100644 index bbaa094e9..000000000 --- a/examples/tiny/local/run_infer.sh +++ /dev/null @@ -1,43 +0,0 @@ -#! /usr/bin/env bash - -# download language model -cd $MAIN_ROOT/models/lm > /dev/null -bash download_lm_en.sh -if [ $? -ne 0 ]; then - exit 1 -fi -cd - > /dev/null - - -# infer -CUDA_VISIBLE_DEVICES=0 \ -python3 -u $MAIN_ROOT/infer.py \ ---num_samples=10 \ ---beam_size=500 \ ---num_proc_bsearch=8 \ ---num_conv_layers=2 \ ---num_rnn_layers=3 \ ---rnn_layer_size=2048 \ ---alpha=2.5 \ ---beta=0.3 \ ---cutoff_prob=1.0 \ ---cutoff_top_n=40 \ ---use_gru=False \ ---use_gpu=True \ ---share_rnn_weights=True \ ---infer_manifest="data/manifest.test-clean" \ ---mean_std_path="data/mean_std.npz" \ ---vocab_path="data/vocab.txt" \ ---model_path="checkpoints/step_final" \ ---lang_model_path="$MAIN_ROOT/models/lm/common_crawl_00.prune01111.trie.klm" \ ---decoding_method="ctc_beam_search" \ ---error_rate_type="wer" \ ---specgram_type="linear" - -if [ $? -ne 0 ]; then - echo "Failed in inference!" - exit 1 -fi - - -exit 0 diff --git a/examples/tiny/local/run_infer_golden.sh b/examples/tiny/local/run_infer_golden.sh deleted file mode 100644 index 21663681b..000000000 --- a/examples/tiny/local/run_infer_golden.sh +++ /dev/null @@ -1,52 +0,0 @@ -#! 
/usr/bin/env bash - -# download language model -cd ${MAIN_ROOT}/models/lm > /dev/null -bash download_lm_en.sh -if [ $? -ne 0 ]; then - exit 1 -fi -cd - > /dev/null - - -# download well-trained model -cd ${MAIN_ROOT}/models/librispeech > /dev/null -bash download_model.sh -if [ $? -ne 0 ]; then - exit 1 -fi -cd - > /dev/null - - -# infer -CUDA_VISIBLE_DEVICES=0 \ -python3 -u ${MAIN_ROOT}/infer.py \ ---num_samples=10 \ ---beam_size=500 \ ---num_proc_bsearch=8 \ ---num_conv_layers=2 \ ---num_rnn_layers=3 \ ---rnn_layer_size=2048 \ ---alpha=2.5 \ ---beta=0.3 \ ---cutoff_prob=1.0 \ ---cutoff_top_n=40 \ ---use_gru=False \ ---use_gpu=True \ ---share_rnn_weights=True \ ---infer_manifest="data/manifest.test-clean" \ ---mean_std_path="${MAIN_ROOT}/models/librispeech/mean_std.npz" \ ---vocab_path="${MAIN_ROOT}/models/librispeech/vocab.txt" \ ---model_path="${MAIN_ROOT}/models/librispeech" \ ---lang_model_path="${MAIN_ROOT}/models/lm/common_crawl_00.prune01111.trie.klm" \ ---decoding_method="ctc_beam_search" \ ---error_rate_type="wer" \ ---specgram_type="linear" - -if [ $? -ne 0 ]; then - echo "Failed in inference!" - exit 1 -fi - - -exit 0 diff --git a/examples/tiny/local/run_test.sh b/examples/tiny/local/run_test.sh deleted file mode 100644 index ef1fa5a2d..000000000 --- a/examples/tiny/local/run_test.sh +++ /dev/null @@ -1,43 +0,0 @@ -#! /usr/bin/env bash - -# download language model -cd $MAIN_ROOT/models/lm > /dev/null -bash download_lm_en.sh -if [ $? -ne 0 ]; then - exit 1 -fi -cd - > /dev/null - - -# evaluate model -CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ -python3 -u $MAIN_ROOT/test.py \ ---batch_size=128 \ ---beam_size=500 \ ---num_proc_bsearch=8 \ ---num_conv_layers=2 \ ---num_rnn_layers=3 \ ---rnn_layer_size=2048 \ ---alpha=2.5 \ ---beta=0.3 \ ---cutoff_prob=1.0 \ ---cutoff_top_n=40 \ ---use_gru=False \ ---use_gpu=True \ ---share_rnn_weights=True \ ---test_manifest="data/manifest.test-clean" \ ---mean_std_path="data/mean_std.npz" \ ---vocab_path="data/vocab.txt" \ ---model_path="checkpoints/step_final" \ ---lang_model_path="$MAIN_ROOT/models/lm/common_crawl_00.prune01111.trie.klm" \ ---decoding_method="ctc_beam_search" \ ---error_rate_type="wer" \ ---specgram_type="linear" - -if [ $? -ne 0 ]; then - echo "Failed in evaluation!" - exit 1 -fi - - -exit 0 diff --git a/examples/tiny/local/run_test_golden.sh b/examples/tiny/local/run_test_golden.sh deleted file mode 100644 index 9983fade8..000000000 --- a/examples/tiny/local/run_test_golden.sh +++ /dev/null @@ -1,52 +0,0 @@ -#! /usr/bin/env bash - -# download language model -cd $MAIN_ROOT/models/lm > /dev/null -bash download_lm_en.sh -if [ $? -ne 0 ]; then - exit 1 -fi -cd - > /dev/null - - -# download well-trained model -cd $MAIN_ROOT/models/librispeech > /dev/null -bash download_model.sh -if [ $? 
-ne 0 ]; then - exit 1 -fi -cd - > /dev/null - - -# evaluate model -CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ -python3 -u $MAIN_ROOT/test.py \ ---batch_size=128 \ ---beam_size=500 \ ---num_proc_bsearch=8 \ ---num_conv_layers=2 \ ---num_rnn_layers=3 \ ---rnn_layer_size=2048 \ ---alpha=2.5 \ ---beta=0.3 \ ---cutoff_prob=1.0 \ ---cutoff_top_n=40 \ ---use_gru=False \ ---use_gpu=True \ ---share_rnn_weights=True \ ---test_manifest="data/manifest.test-clean" \ ---mean_std_path="$MAIN_ROOT/models/librispeech/mean_std.npz" \ ---vocab_path="$MAIN_ROOT/models/librispeech/vocab.txt" \ ---model_path="$MAIN_ROOT/models/librispeech" \ ---lang_model_path="$MAIN_ROOT/models/lm/common_crawl_00.prune01111.trie.klm" \ ---decoding_method="ctc_beam_search" \ ---error_rate_type="wer" \ ---specgram_type="linear" - -if [ $? -ne 0 ]; then - echo "Failed in evaluation!" - exit 1 -fi - - -exit 0 diff --git a/examples/tiny/local/run_train.sh b/examples/tiny/local/run_train.sh deleted file mode 100644 index de9dcbd74..000000000 --- a/examples/tiny/local/run_train.sh +++ /dev/null @@ -1,40 +0,0 @@ -#! /usr/bin/env bash - -# train model -# if you wish to resume from an exists model, uncomment --init_from_pretrained_model -export FLAGS_sync_nccl_allreduce=0 -CUDA_VISIBLE_DEVICES=0,1,2,3 \ -python3 -u ${MAIN_ROOT}/train.py \ ---batch_size=4 \ ---num_epoch=20 \ ---num_conv_layers=2 \ ---num_rnn_layers=3 \ ---rnn_layer_size=2048 \ ---num_iter_print=1 \ ---save_epoch=1 \ ---num_samples=64 \ ---learning_rate=1e-5 \ ---max_duration=27.0 \ ---min_duration=0.0 \ ---test_off=False \ ---use_sortagrad=True \ ---use_gru=False \ ---use_gpu=True \ ---is_local=True \ ---share_rnn_weights=True \ ---train_manifest="data/manifest.tiny" \ ---dev_manifest="data/manifest.tiny" \ ---mean_std_path="data/mean_std.npz" \ ---vocab_path="data/vocab.txt" \ ---output_model_dir="./checkpoints/" \ ---augment_conf_path="${MAIN_ROOT}/conf/augmentation.config" \ ---specgram_type="linear" \ ---shuffle_method="batch_shuffle_clipped" \ - -if [ $? -ne 0 ]; then - echo "Failed in training!" - exit 1 -fi - - -exit 0 diff --git a/examples/tiny/local/run_tune.sh b/examples/tiny/local/run_tune.sh deleted file mode 100644 index b5cc4d6a1..000000000 --- a/examples/tiny/local/run_tune.sh +++ /dev/null @@ -1,38 +0,0 @@ -#! /usr/bin/env bash - -# grid-search for hyper-parameters in language model -CUDA_VISIBLE_DEVICES=0,1,2,3 \ -python3 -u $MAIN_ROOT/tools/tune.py \ ---num_batches=-1 \ ---batch_size=128 \ ---beam_size=500 \ ---num_proc_bsearch=12 \ ---num_conv_layers=2 \ ---num_rnn_layers=3 \ ---rnn_layer_size=2048 \ ---num_alphas=45 \ ---num_betas=8 \ ---alpha_from=1.0 \ ---alpha_to=3.2 \ ---beta_from=0.1 \ ---beta_to=0.45 \ ---cutoff_prob=1.0 \ ---cutoff_top_n=40 \ ---use_gru=False \ ---use_gpu=True \ ---share_rnn_weights=True \ ---tune_manifest="data/manifest.dev-clean" \ ---mean_std_path="data/mean_std.npz" \ ---vocab_path="data/vocab.txt" \ ---model_path="$MAIN_ROOT/models/librispeech" \ ---lang_model_path="$MAIN_ROOT/models/lm/common_crawl_00.prune01111.trie.klm" \ ---error_rate_type="wer" \ ---specgram_type="linear" - -if [ $? -ne 0 ]; then - echo "Failed in tuning!" - exit 1 -fi - - -exit 0 diff --git a/examples/tiny/local/test.sh b/examples/tiny/local/test.sh new file mode 100644 index 000000000..fedebf96d --- /dev/null +++ b/examples/tiny/local/test.sh @@ -0,0 +1,23 @@ +#! /usr/bin/env bash + +# download language model +bash local/download_lm_en.sh +if [ $? 
-ne 0 ]; then + exit 1 +fi + +CUDA_VISIBLE_DEVICES=0 \ +python3 -u ${BIN_DIR}/test.py \ +--device 'gpu' \ +--nproc 1 \ +--config conf/deepspeech2.yaml \ +--output ckpt + + +if [ $? -ne 0 ]; then + echo "Failed in evaluation!" + exit 1 +fi + + +exit 0 diff --git a/examples/tiny/local/train.sh b/examples/tiny/local/train.sh new file mode 100644 index 000000000..369ccc924 --- /dev/null +++ b/examples/tiny/local/train.sh @@ -0,0 +1,18 @@ +#! /usr/bin/env bash + +export FLAGS_sync_nccl_allreduce=0 + +CUDA_VISIBLE_DEVICES=0 \ +python3 -u ${BIN_DIR}/train.py \ +--device 'gpu' \ +--nproc 1 \ +--config conf/deepspeech2.yaml \ +--output ckpt + +if [ $? -ne 0 ]; then + echo "Failed in training!" + exit 1 +fi + + +exit 0 diff --git a/examples/tiny/local/tune.sh b/examples/tiny/local/tune.sh new file mode 100644 index 000000000..4bb81d29b --- /dev/null +++ b/examples/tiny/local/tune.sh @@ -0,0 +1,33 @@ +#! /usr/bin/env bash + +if [ $# != 1 ];then + echo "usage: tune ckpt_path" + exit 1 +fi + +# grid-search for hyper-parameters in language model +python3 -u ${BIN_DIR}/tune.py \ +--device 'gpu' \ +--nproc 1 \ +--config conf/deepspeech2.yaml \ +--num_batches=-1 \ +--batch_size=128 \ +--beam_size=500 \ +--num_proc_bsearch=12 \ +--num_alphas=45 \ +--num_betas=8 \ +--alpha_from=1.0 \ +--alpha_to=3.2 \ +--beta_from=0.1 \ +--beta_to=0.45 \ +--cutoff_prob=1.0 \ +--cutoff_top_n=40 \ +--checkpoint_path ${1} + +if [ $? -ne 0 ]; then + echo "Failed in tuning!" + exit 1 +fi + + +exit 0 diff --git a/examples/tiny/path.sh b/examples/tiny/path.sh index fd1cebba8..a179631b3 100644 --- a/examples/tiny/path.sh +++ b/examples/tiny/path.sh @@ -6,3 +6,9 @@ export LC_ALL=C # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ + + +MODEL=deepspeech2 +export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin diff --git a/examples/tiny/run.sh b/examples/tiny/run.sh index c8e589139..2b5ed5308 100644 --- a/examples/tiny/run.sh +++ b/examples/tiny/run.sh @@ -1,24 +1,16 @@ #!/bin/bash +set -e source path.sh # prepare data -bash ./local/run_data.sh - -# test pretrain model -bash ./local/run_test_golden.sh - -# test pretain model -bash ./local/run_infer_golden.sh +bash ./local/data.sh # train model -bash ./local/run_train.sh +bash ./local/train.sh # test model -bash ./local/run_test.sh +bash ./local/test.sh # infer model -bash ./local/run_infer.sh - -# tune model -bash ./local/run_tune.sh +bash ./local/infer.sh diff --git a/infer.py b/infer.py deleted file mode 100644 index ffcb48eb6..000000000 --- a/infer.py +++ /dev/null @@ -1,159 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
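Editor's note: `tune.sh` grid-searches the two beam-search weights: `alpha` scales the language-model score and `beta` the word-count bonus (the deleted `infer.py` below documents them as "Coef of LM" and "Coef of WC"). With `num_alphas=45` over [1.0, 3.2] and `num_betas=8` over [0.1, 0.45], `${BIN_DIR}/tune.py` has 360 (alpha, beta) pairs to score. How the grid is enumerated inside `tune.py` is an assumption here; the sketch only shows the arithmetic implied by the flags.

```python
# Hypothetical sketch of the (alpha, beta) grid requested by tune.sh.
import numpy as np

num_alphas, num_betas = 45, 8
alphas = np.linspace(1.0, 3.2, num_alphas)    # --alpha_from / --alpha_to
betas = np.linspace(0.1, 0.45, num_betas)     # --beta_from / --beta_to

grid = [(a, b) for a in alphas for b in betas]
print(len(grid))   # 360 decoding passes over the tune manifest
# Each pair is typically scored by re-decoding with something like
#   total = log P_ctc + alpha * log P_lm + beta * word_count
# and keeping the pair with the lowest WER/CER.
```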
-"""Inferer for DeepSpeech2 model.""" - -import sys -import argparse -import functools -import paddle.fluid as fluid -from data_utils.data import DataGenerator -from model_utils.model import DeepSpeech2Model -from model_utils.model_check import check_cuda, check_version -from utils.error_rate import wer, cer -from utils.utility import add_arguments, print_arguments - -parser = argparse.ArgumentParser(description=__doc__) -add_arg = functools.partial(add_arguments, argparser=parser) -# yapf: disable -add_arg('num_samples', int, 10, "# of samples to infer.") -add_arg('beam_size', int, 500, "Beam search width.") -add_arg('num_proc_bsearch', int, 8, "# of CPUs for beam search.") -add_arg('num_conv_layers', int, 2, "# of convolution layers.") -add_arg('num_rnn_layers', int, 3, "# of recurrent layers.") -add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.") -add_arg('alpha', float, 2.5, "Coef of LM for beam search.") -add_arg('beta', float, 0.3, "Coef of WC for beam search.") -add_arg('cutoff_prob', float, 1.0, "Cutoff probability for pruning.") -add_arg('cutoff_top_n', int, 40, "Cutoff number for pruning.") -add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.") -add_arg('use_gpu', bool, True, "Use GPU or not.") -add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " - "bi-directional RNNs. Not for GRU.") -add_arg('infer_manifest', str, - 'data/librispeech/manifest.dev-clean', - "Filepath of manifest to infer.") -add_arg('mean_std_path', str, - 'data/librispeech/mean_std.npz', - "Filepath of normalizer's mean & std.") -add_arg('vocab_path', str, - 'data/librispeech/vocab.txt', - "Filepath of vocabulary.") -add_arg('lang_model_path', str, - 'models/lm/common_crawl_00.prune01111.trie.klm', - "Filepath for language model.") -add_arg('model_path', str, - './checkpoints/libri/step_final', - "If None, the training starts from scratch, " - "otherwise, it resumes from the pre-trained model.") -add_arg('decoding_method', str, - 'ctc_beam_search', - "Decoding method. Options: ctc_beam_search, ctc_greedy", - choices = ['ctc_beam_search', 'ctc_greedy']) -add_arg('error_rate_type', str, - 'wer', - "Error rate type for evaluation.", - choices=['wer', 'cer']) -add_arg('specgram_type', str, - 'linear', - "Audio feature type. 
Options: linear, mfcc.", - choices=['linear', 'mfcc']) -# yapf: disable -args = parser.parse_args() - - -def infer(): - """Inference for DeepSpeech2.""" - - # check if set use_gpu=True in paddlepaddle cpu version - check_cuda(args.use_gpu) - # check if paddlepaddle version is satisfied - check_version() - - if args.use_gpu: - place = fluid.CUDAPlace(0) - else: - place = fluid.CPUPlace() - - data_generator = DataGenerator( - vocab_filepath=args.vocab_path, - mean_std_filepath=args.mean_std_path, - augmentation_config='{}', - specgram_type=args.specgram_type, - keep_transcription_text=True, - place = place, - is_training = False) - batch_reader = data_generator.batch_reader_creator( - manifest_path=args.infer_manifest, - batch_size=args.num_samples, - sortagrad=False, - shuffle_method=None) - infer_data = next(batch_reader()) - - ds2_model = DeepSpeech2Model( - vocab_size=data_generator.vocab_size, - num_conv_layers=args.num_conv_layers, - num_rnn_layers=args.num_rnn_layers, - rnn_layer_size=args.rnn_layer_size, - use_gru=args.use_gru, - share_rnn_weights=args.share_rnn_weights, - place=place, - init_from_pretrained_model=args.model_path) - - # decoders only accept string encoded in utf-8 - vocab_list = [chars for chars in data_generator.vocab_list] - - if args.decoding_method == "ctc_greedy": - ds2_model.logger.info("start inference ...") - probs_split = ds2_model.infer_batch_probs( - infer_data=infer_data, - feeding_dict=data_generator.feeding) - - result_transcripts = ds2_model.decode_batch_greedy( - probs_split=probs_split, - vocab_list=vocab_list) - else: - ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path, - vocab_list) - ds2_model.logger.info("start inference ...") - probs_split= ds2_model.infer_batch_probs( - infer_data=infer_data, - feeding_dict=data_generator.feeding) - - result_transcripts= ds2_model.decode_batch_beam_search( - probs_split=probs_split, - beam_alpha=args.alpha, - beam_beta=args.beta, - beam_size=args.beam_size, - cutoff_prob=args.cutoff_prob, - cutoff_top_n=args.cutoff_top_n, - vocab_list=vocab_list, - num_processes=args.num_proc_bsearch) - - error_rate_func = cer if args.error_rate_type == 'cer' else wer - target_transcripts = infer_data[1] - for target, result in zip(target_transcripts, result_transcripts): - print("\nTarget Transcription: %s\nOutput Transcription: %s" % - (target, result)) - print("Current error rate [%s] = %f" % - (args.error_rate_type, error_rate_func(target, result))) - - ds2_model.logger.info("finish inference") - -def main(): - print_arguments(args) - infer() - - -if __name__ == '__main__': - main() diff --git a/model_utils/model.py b/model_utils/model.py deleted file mode 100644 index f4555bd69..000000000 --- a/model_utils/model.py +++ /dev/null @@ -1,573 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Contains DeepSpeech2 model.""" - -import sys -import os -import time -import logging -import gzip -import copy -import inspect -import collections -import multiprocessing -import numpy as np -from distutils.dir_util import mkpath -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -from decoders.swig_wrapper import Scorer -from decoders.swig_wrapper import ctc_greedy_decoder -from decoders.swig_wrapper import ctc_beam_search_decoder_batch -from model_utils.network import deep_speech_v2_network - -logging.basicConfig( - format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s') - - -class DeepSpeech2Model(object): - """DeepSpeech2Model class. - - :param vocab_size: Decoding vocabulary size. - :type vocab_size: int - :param num_conv_layers: Number of stacking convolution layers. - :type num_conv_layers: int - :param num_rnn_layers: Number of stacking RNN layers. - :type num_rnn_layers: int - :param rnn_layer_size: RNN layer size (number of RNN cells). - :type rnn_layer_size: int - :param use_gru: Use gru if set True. Use simple rnn if set False. - :type use_gru: bool - :param share_rnn_weights: Whether to share input-hidden weights between - forward and backward directional RNNs.Notice that - for GRU, weight sharing is not supported. - :type share_rnn_weights: bool - :param place: Program running place. - :type place: CPUPlace or CUDAPlace - :param init_from_pretrained_model: Pretrained model path. If None, will train - from stratch. - :type init_from_pretrained_model: string|None - :param output_model_dir: Output model directory. If None, output to current directory. - :type output_model_dir: string|None - """ - - def __init__(self, - vocab_size, - num_conv_layers, - num_rnn_layers, - rnn_layer_size, - use_gru=False, - share_rnn_weights=True, - place=fluid.CPUPlace(), - init_from_pretrained_model=None, - output_model_dir=None): - self._vocab_size = vocab_size - self._num_conv_layers = num_conv_layers - self._num_rnn_layers = num_rnn_layers - self._rnn_layer_size = rnn_layer_size - self._use_gru = use_gru - self._share_rnn_weights = share_rnn_weights - self._place = place - self._init_from_pretrained_model = init_from_pretrained_model - self._output_model_dir = output_model_dir - self._ext_scorer = None - self.logger = logging.getLogger("") - self.logger.setLevel(level=logging.INFO) - - def create_network(self, is_infer=False): - """Create data layers and model network. - :param is_training: Whether to create a network for training. - :type is_training: bool - :return reader: Reader for input. - :rtype reader: read generater - :return log_probs: An output unnormalized log probability layer. - :rtype lig_probs: Varable - :return loss: A ctc loss layer. 
- :rtype loss: Variable - """ - - if not is_infer: - input_fields = { - 'names': ['audio_data', 'text_data', 'seq_len_data', 'masks'], - 'shapes': - [[None, 161, None], [None, 1], [None, 1], [None, 32, 81, None]], - 'dtypes': ['float32', 'int32', 'int64', 'float32'], - 'lod_levels': [0, 1, 0, 0] - } - - inputs = [ - fluid.data( - name=input_fields['names'][i], - shape=input_fields['shapes'][i], - dtype=input_fields['dtypes'][i], - lod_level=input_fields['lod_levels'][i]) - for i in range(len(input_fields['names'])) - ] - - reader = fluid.io.DataLoader.from_generator( - feed_list=inputs, - capacity=64, - iterable=False, - use_double_buffer=True) - - (audio_data, text_data, seq_len_data, masks) = inputs - else: - audio_data = fluid.data( - name='audio_data', - shape=[None, 161, None], - dtype='float32', - lod_level=0) - seq_len_data = fluid.data( - name='seq_len_data', - shape=[None, 1], - dtype='int64', - lod_level=0) - masks = fluid.data( - name='masks', - shape=[None, 32, 81, None], - dtype='float32', - lod_level=0) - text_data = None - reader = fluid.DataFeeder([audio_data, seq_len_data, masks], - self._place) - - log_probs, loss = deep_speech_v2_network( - audio_data=audio_data, - text_data=text_data, - seq_len_data=seq_len_data, - masks=masks, - dict_size=self._vocab_size, - num_conv_layers=self._num_conv_layers, - num_rnn_layers=self._num_rnn_layers, - rnn_size=self._rnn_layer_size, - use_gru=self._use_gru, - share_rnn_weights=self._share_rnn_weights) - return reader, log_probs, loss - - def init_from_pretrained_model(self, exe, program): - '''Init params from pretrain model. ''' - - assert isinstance(self._init_from_pretrained_model, str) - - if not os.path.exists(self._init_from_pretrained_model): - print(self._init_from_pretrained_model) - raise Warning("The pretrained params do not exist.") - return False - fluid.io.load_params( - exe, - self._init_from_pretrained_model, - main_program=program, - filename="params.pdparams") - - print("finish initing model from pretrained params from %s" % - (self._init_from_pretrained_model)) - - pre_epoch = 0 - dir_name = self._init_from_pretrained_model.split('_') - if len(dir_name) >= 2 and dir_name[-2].endswith('epoch') and dir_name[ - -1].isdigit(): - pre_epoch = int(dir_name[-1]) - - return pre_epoch + 1 - - def save_param(self, exe, program, dirname): - '''Save model params to dirname''' - - assert isinstance(self._output_model_dir, str) - - param_dir = os.path.join(self._output_model_dir) - - if not os.path.exists(param_dir): - os.mkdir(param_dir) - - fluid.io.save_params( - exe, - os.path.join(param_dir, dirname), - main_program=program, - filename="params.pdparams") - print("save parameters at %s" % (os.path.join(param_dir, dirname))) - - return True - - def test(self, exe, dev_batch_reader, test_program, test_reader, - fetch_list): - '''Test the model. - - :param exe:The executor of program. - :type exe: Executor - :param dev_batch_reader: The reader of test dataa. - :type dev_batch_reader: read generator - :param test_program: The program of test. - :type test_program: Program - :param test_reader: Reader of test. - :type test_reader: Reader - :param fetch_list: Fetch list. - :type fetch_list: list - :return: An output unnormalized log probability. 
- :rtype: array - ''' - test_reader.start() - epoch_loss = [] - while True: - try: - each_loss = exe.run( - program=test_program, - fetch_list=fetch_list, - return_numpy=False) - epoch_loss.extend(np.array(each_loss[0])) - - except fluid.core.EOFException: - test_reader.reset() - break - return np.mean(np.array(epoch_loss)) - - def train(self, - train_batch_reader, - dev_batch_reader, - feeding_dict, - learning_rate, - gradient_clipping, - num_epoch, - batch_size, - num_samples, - save_epoch=100, - num_iterations_print=100, - test_off=False): - """Train the model. - - :param train_batch_reader: Train data reader. - :type train_batch_reader: callable - :param dev_batch_reader: Validation data reader. - :type dev_batch_reader: callable - :param feeding_dict: Feeding is a map of field name and tuple index - of the data that reader returns. - :type feeding_dict: dict|list - :param learning_rate: Learning rate for ADAM optimizer. - :type learning_rate: float - :param gradient_clipping: Gradient clipping threshold. - :type gradient_clipping: float - :param num_epoch: Number of training epochs. - :type num_epoch: int - :param batch_size: Number of batch size. - :type batch_size: int - :param num_samples: The num of train samples. - :type num_samples: int - :param save_epoch: Number of training iterations for save checkpoint and params. - :type save_epoch: int - :param num_iterations_print: Number of training iterations for printing - a training loss. - :type num_iteratons_print: int - :param test_off: Turn off testing. - :type test_off: bool - """ - # prepare model output directory - if not os.path.exists(self._output_model_dir): - mkpath(self._output_model_dir) - - # adapt the feeding dict according to the network - adapted_feeding_dict = self._adapt_feeding_dict(feeding_dict) - - if isinstance(self._place, fluid.CUDAPlace): - dev_count = fluid.core.get_cuda_device_count() - else: - dev_count = int(os.environ.get('CPU_NUM', 1)) - - # prepare the network - train_program = fluid.Program() - startup_prog = fluid.Program() - with fluid.program_guard(train_program, startup_prog): - with fluid.unique_name.guard(): - train_reader, log_probs, ctc_loss = self.create_network() - # prepare optimizer - optimizer = fluid.optimizer.AdamOptimizer( - learning_rate=fluid.layers.exponential_decay( - learning_rate=learning_rate, - decay_steps=num_samples / batch_size / dev_count, - decay_rate=0.83, - staircase=True), - grad_clip=fluid.clip.GradientClipByGlobalNorm( - clip_norm=gradient_clipping)) - optimizer.minimize(loss=ctc_loss) - - test_prog = fluid.Program() - with fluid.program_guard(test_prog, startup_prog): - with fluid.unique_name.guard(): - test_reader, _, ctc_loss = self.create_network() - - test_prog = test_prog.clone(for_test=True) - - exe = fluid.Executor(self._place) - exe.run(startup_prog) - - # init from some pretrain models, to better solve the current task - pre_epoch = 0 - if self._init_from_pretrained_model: - pre_epoch = self.init_from_pretrained_model(exe, train_program) - - build_strategy = compiler.BuildStrategy() - exec_strategy = fluid.ExecutionStrategy() - - # pass the build_strategy to with_data_parallel API - compiled_prog = compiler.CompiledProgram( - train_program).with_data_parallel( - loss_name=ctc_loss.name, - build_strategy=build_strategy, - exec_strategy=exec_strategy) - - train_reader.set_batch_generator(train_batch_reader) - test_reader.set_batch_generator(dev_batch_reader) - - # run train - for epoch_id in range(num_epoch): - train_reader.start() - epoch_loss = [] - 
time_begin = time.time() - batch_id = 0 - step = 0 - while True: - try: - fetch_list = [ctc_loss.name] - - if batch_id % num_iterations_print == 0: - fetch = exe.run( - program=compiled_prog, - fetch_list=fetch_list, - return_numpy=False) - each_loss = fetch[0] - epoch_loss.extend(np.array(each_loss[0]) / batch_size) - - print("epoch: %d, batch: %d, train loss: %f\n" % - (epoch_id, batch_id, - np.mean(each_loss[0]) / batch_size)) - - else: - each_loss = exe.run( - program=compiled_prog, - fetch_list=[], - return_numpy=False) - - batch_id = batch_id + 1 - except fluid.core.EOFException: - train_reader.reset() - break - time_end = time.time() - used_time = time_end - time_begin - if test_off: - print("\n--------Time: %f sec, epoch: %d, train loss: %f\n" % - (used_time, epoch_id, np.mean(np.array(epoch_loss)))) - else: - print('\n----------Begin test...') - test_loss = self.test( - exe, - dev_batch_reader=dev_batch_reader, - test_program=test_prog, - test_reader=test_reader, - fetch_list=[ctc_loss]) - print( - "--------Time: %f sec, epoch: %d, train loss: %f, test loss: %f" - % (used_time, epoch_id + pre_epoch, - np.mean(np.array(epoch_loss)), test_loss / batch_size)) - if (epoch_id + 1) % save_epoch == 0: - self.save_param(exe, train_program, - "epoch_" + str(epoch_id + pre_epoch)) - - self.save_param(exe, train_program, "step_final") - - print("\n------------Training finished!!!-------------") - - def infer_batch_probs(self, infer_data, feeding_dict): - """Infer the prob matrices for a batch of speech utterances. - :param infer_data: List of utterances to infer, with each utterance - consisting of a tuple of audio features and - transcription text (empty string). - :type infer_data: list - :param feeding_dict: Feeding is a map of field name and tuple index - of the data that reader returns. - :type feeding_dict: dict|list - :return: List of 2-D probability matrix, and each consists of prob - vectors for one speech utterancce. - :rtype: List of matrix - """ - # define inferer - infer_program = fluid.Program() - startup_prog = fluid.Program() - - # adapt the feeding dict according to the network - adapted_feeding_dict = self._adapt_feeding_dict(feeding_dict) - - # prepare the network - with fluid.program_guard(infer_program, startup_prog): - with fluid.unique_name.guard(): - feeder, log_probs, _ = self.create_network(is_infer=True) - - infer_program = infer_program.clone(for_test=True) - exe = fluid.Executor(self._place) - exe.run(startup_prog) - - # init param from pretrained_model - if not self._init_from_pretrained_model: - exit("No pretrain model file path!") - self.init_from_pretrained_model(exe, infer_program) - - infer_results = [] - time_begin = time.time() - - # run inference - for i in range(infer_data[0].shape[0]): - each_log_probs = exe.run( - program=infer_program, - feed=feeder.feed( - [[infer_data[0][i], infer_data[2][i], infer_data[3][i]]]), - fetch_list=[log_probs], - return_numpy=False) - infer_results.extend(np.array(each_log_probs[0])) - - # slice result - infer_results = np.array(infer_results) - seq_len = (infer_data[2] - 1) // 3 + 1 - - start_pos = [0] * (infer_data[0].shape[0] + 1) - for i in range(infer_data[0].shape[0]): - start_pos[i + 1] = start_pos[i] + seq_len[i][0] - probs_split = [ - infer_results[start_pos[i]:start_pos[i + 1]] - for i in range(0, infer_data[0].shape[0]) - ] - - return probs_split - - def decode_batch_greedy(self, probs_split, vocab_list): - """Decode by best path for a batch of probs matrix input. 
- :param probs_split: List of 2-D probability matrix, and each consists - of prob vectors for one speech utterancce. - :param probs_split: List of matrix - :param vocab_list: List of tokens in the vocabulary, for decoding. - :type vocab_list: list - :return: List of transcription texts. - :rtype: List of str - """ - results = [] - for i, probs in enumerate(probs_split): - output_transcription = ctc_greedy_decoder( - probs_seq=probs, vocabulary=vocab_list) - results.append(output_transcription) - print(results) - return results - - def init_ext_scorer(self, beam_alpha, beam_beta, language_model_path, - vocab_list): - """Initialize the external scorer. - :param beam_alpha: Parameter associated with language model. - :type beam_alpha: float - :param beam_beta: Parameter associated with word count. - :type beam_beta: float - :param language_model_path: Filepath for language model. If it is - empty, the external scorer will be set to - None, and the decoding method will be pure - beam search without scorer. - :type language_model_path: str|None - :param vocab_list: List of tokens in the vocabulary, for decoding. - :type vocab_list: list - """ - if language_model_path != '': - self.logger.info("begin to initialize the external scorer " - "for decoding") - self._ext_scorer = Scorer(beam_alpha, beam_beta, - language_model_path, vocab_list) - lm_char_based = self._ext_scorer.is_character_based() - lm_max_order = self._ext_scorer.get_max_order() - lm_dict_size = self._ext_scorer.get_dict_size() - self.logger.info("language model: " - "is_character_based = %d," % lm_char_based + - " max_order = %d," % lm_max_order + - " dict_size = %d" % lm_dict_size) - self.logger.info("end initializing scorer") - else: - self._ext_scorer = None - self.logger.info("no language model provided, " - "decoding by pure beam search without scorer.") - - def decode_batch_beam_search(self, probs_split, beam_alpha, beam_beta, - beam_size, cutoff_prob, cutoff_top_n, - vocab_list, num_processes): - """Decode by beam search for a batch of probs matrix input. - :param probs_split: List of 2-D probability matrix, and each consists - of prob vectors for one speech utterancce. - :param probs_split: List of matrix - :param beam_alpha: Parameter associated with language model. - :type beam_alpha: float - :param beam_beta: Parameter associated with word count. - :type beam_beta: float - :param beam_size: Width for Beam search. - :type beam_size: int - :param cutoff_prob: Cutoff probability in pruning, - default 1.0, no pruning. - :type cutoff_prob: float - :param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n - characters with highest probs in vocabulary will be - used in beam search, default 40. - :type cutoff_top_n: int - :param vocab_list: List of tokens in the vocabulary, for decoding. - :type vocab_list: list - :param num_processes: Number of processes (CPU) for decoder. - :type num_processes: int - :return: List of transcription texts. 
- :rtype: List of str - """ - if self._ext_scorer != None: - self._ext_scorer.reset_params(beam_alpha, beam_beta) - # beam search decode - num_processes = min(num_processes, len(probs_split)) - beam_search_results = ctc_beam_search_decoder_batch( - probs_split=probs_split, - vocabulary=vocab_list, - beam_size=beam_size, - num_processes=num_processes, - ext_scoring_func=self._ext_scorer, - cutoff_prob=cutoff_prob, - cutoff_top_n=cutoff_top_n) - - results = [result[0][1] for result in beam_search_results] - return results - - def _adapt_feeding_dict(self, feeding_dict): - """Adapt feeding dict according to network struct. - - To remove impacts from padding part, we add scale_sub_region layer and - sub_seq layer. For sub_seq layer, 'sequence_offset' and - 'sequence_length' fields are appended. For each scale_sub_region layer - 'convN_index_range' field is appended. - - :param feeding_dict: Feeding is a map of field name and tuple index - of the data that reader returns. - :type feeding_dict: dict|list - :return: Adapted feeding dict. - :rtype: dict|list - """ - adapted_feeding_dict = copy.deepcopy(feeding_dict) - if isinstance(feeding_dict, dict): - adapted_feeding_dict["sequence_offset"] = len(adapted_feeding_dict) - adapted_feeding_dict["sequence_length"] = len(adapted_feeding_dict) - for i in range(self._num_conv_layers): - adapted_feeding_dict["conv%d_index_range" %i] = \ - len(adapted_feeding_dict) - elif isinstance(feeding_dict, list): - adapted_feeding_dict.append("sequence_offset") - adapted_feeding_dict.append("sequence_length") - for i in range(self._num_conv_layers): - adapted_feeding_dict.append("conv%d_index_range" % i) - else: - raise ValueError("Type of feeding_dict is %s, not supported." % - type(feeding_dict)) - - return adapted_feeding_dict diff --git a/model_utils/model_check.py b/model_utils/model_check.py deleted file mode 100644 index bf2c424fd..000000000 --- a/model_utils/model_check.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import paddle -import paddle.fluid as fluid - - -def check_cuda(use_cuda, err = \ - "\nYou can not set use_cuda = True in the model because you are using paddlepaddle-cpu.\n \ - Please: 1. Install paddlepaddle-gpu to run your models on GPU or 2. Set use_cuda = False to run models on CPU.\n" - ): - """ - Log error and exit when set use_gpu=true in paddlepaddle - cpu version. - """ - try: - if use_cuda == True and fluid.is_compiled_with_cuda() == False: - print(err) - sys.exit(1) - except Exception as e: - pass - - -def check_version(): - """ - Log error and exit when the installed version of paddlepaddle is - not satisfied. - """ - err = "PaddlePaddle version 1.6 or higher is required, " \ - "or a suitable develop version is satisfied as well. \n" \ - "Please make sure the version is good with your code." 
\ - - try: - fluid.require_version('1.6.0') - except Exception as e: - print(err) - sys.exit(1) diff --git a/model_utils/network.py b/model_utils/network.py deleted file mode 100644 index 19f9d887c..000000000 --- a/model_utils/network.py +++ /dev/null @@ -1,458 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import collections -import paddle.fluid as fluid -import numpy as np - - -def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride, - padding, act, masks, name): - """Convolution layer with batch normalization. - - :param input: Input layer. - :type input: Variable - :param filter_size: The x dimension of a filter kernel. Or input a tuple for - two image dimension. - :type filter_size: int|tuple|list - :param num_channels_in: Number of input channels. - :type num_channels_in: int - :param num_channels_out: Number of output channels. - :type num_channels_out: int - :param stride: The x dimension of the stride. Or input a tuple for two - image dimension. - :type stride: int|tuple|list - :param padding: The x dimension of the padding. Or input a tuple for two - image dimension. - :type padding: int|tuple|list - :param act: Activation type. - :type act: string - :param masks: Masks data layer to reset padding. - :type masks: Variable - :param name: Name of the layer. - :param name: string - :return: Batch norm layer after convolution layer. - :rtype: Variable - - """ - conv_layer = fluid.layers.conv2d( - input=input, - num_filters=num_channels_out, - filter_size=filter_size, - stride=stride, - padding=padding, - param_attr=fluid.ParamAttr(name=name + '_conv2d_weight'), - act=None, - bias_attr=False) - - batch_norm = fluid.layers.batch_norm( - input=conv_layer, - act=act, - param_attr=fluid.ParamAttr(name=name + '_batch_norm_weight'), - bias_attr=fluid.ParamAttr(name=name + '_batch_norm_bias'), - moving_mean_name=name + '_batch_norm_moving_mean', - moving_variance_name=name + '_batch_norm_moving_variance') - - # reset padding part to 0 - padding_reset = fluid.layers.elementwise_mul(batch_norm, masks) - return padding_reset - - -class RNNCell(fluid.layers.RNNCell): - """A simple rnn cell.""" - - def __init__(self, - hidden_size, - param_attr=None, - bias_attr=None, - hidden_activation=None, - activation=None, - dtype="float32", - name="RNNCell"): - """Initialize simple rnn cell. - - :param hidden_size: Dimension of RNN cells. 
- :type hidden_size: int - :param param_attr: Parameter properties of hidden layer weights that - can be learned - :type param_attr: ParamAttr - :param bias_attr: Bias properties of hidden layer weights that can be learned - :type bias_attr: ParamAttr - :param hidden_activation: Activation for hidden cell - :type hidden_activation: Activation - :param activation: Activation for output - :type activation: Activation - :param name: Name of cell - :type name: string - """ - - self.hidden_size = hidden_size - self.param_attr = param_attr - self.bias_attr = bias_attr - self.hidden_activation = hidden_activation - self.activation = activation or fluid.layers.brelu - self.name = name - - def call(self, inputs, states): - new_hidden = fluid.layers.fc( - input=states, - size=self.hidden_size, - act=self.hidden_activation, - param_attr=self.param_attr, - bias_attr=self.bias_attr) - new_hidden = fluid.layers.elementwise_add(new_hidden, inputs) - new_hidden = self.activation(new_hidden) - - return new_hidden, new_hidden - - @property - def state_shape(self): - return [self.hidden_size] - - -def bidirectional_simple_rnn_bn_layer(name, input, size, share_weights): - """Bidirectonal simple rnn layer with sequence-wise batch normalization. - The batch normalization is only performed on input-state weights. - - :param name: Name of the layer parameters. - :type name: string - :param input: Input layer. - :type input: Variable - :param size: Dimension of RNN cells. - :type size: int - :param share_weights: Whether to share input-hidden weights between - forward and backward directional RNNs. - :type share_weights: bool - :return: Bidirectional simple rnn layer. - :rtype: Variable - """ - forward_cell = RNNCell( - hidden_size=size, - activation=fluid.layers.brelu, - param_attr=fluid.ParamAttr(name=name + '_forward_rnn_weight'), - bias_attr=fluid.ParamAttr(name=name + '_forward_rnn_bias')) - - reverse_cell = RNNCell( - hidden_size=size, - activation=fluid.layers.brelu, - param_attr=fluid.ParamAttr(name=name + '_reverse_rnn_weight'), - bias_attr=fluid.ParamAttr(name=name + '_reverse_rnn_bias')) - - pad_value = fluid.layers.assign(input=np.array([0.0], dtype=np.float32)) - - if share_weights: - #input-hidden weights shared between bi-directional rnn. 
- input_proj = fluid.layers.fc( - input=input, - size=size, - act=None, - param_attr=fluid.ParamAttr(name=name + '_fc_weight'), - bias_attr=False) - - # batch norm is only performed on input-state projection - input_proj_bn_forward = fluid.layers.batch_norm( - input=input_proj, - act=None, - param_attr=fluid.ParamAttr(name=name + '_batch_norm_weight'), - bias_attr=fluid.ParamAttr(name=name + '_batch_norm_bias'), - moving_mean_name=name + '_batch_norm_moving_mean', - moving_variance_name=name + '_batch_norm_moving_variance') - input_proj_bn_reverse = input_proj_bn_forward - else: - input_proj_forward = fluid.layers.fc( - input=input, - size=size, - act=None, - param_attr=fluid.ParamAttr(name=name + '_forward_fc_weight'), - bias_attr=False) - input_proj_reverse = fluid.layers.fc( - input=input, - size=size, - act=None, - param_attr=fluid.ParamAttr(name=name + '_reverse_fc_weight'), - bias_attr=False) - #batch norm is only performed on input-state projection - input_proj_bn_forward = fluid.layers.batch_norm( - input=input_proj_forward, - act=None, - param_attr=fluid.ParamAttr( - name=name + '_forward_batch_norm_weight'), - bias_attr=fluid.ParamAttr(name=name + '_forward_batch_norm_bias'), - moving_mean_name=name + '_forward_batch_norm_moving_mean', - moving_variance_name=name + '_forward_batch_norm_moving_variance') - input_proj_bn_reverse = fluid.layers.batch_norm( - input=input_proj_reverse, - act=None, - param_attr=fluid.ParamAttr( - name=name + '_reverse_batch_norm_weight'), - bias_attr=fluid.ParamAttr(name=name + '_reverse_batch_norm_bias'), - moving_mean_name=name + '_reverse_batch_norm_moving_mean', - moving_variance_name=name + '_reverse_batch_norm_moving_variance') - # forward and backward in time - input, length = fluid.layers.sequence_pad(input_proj_bn_forward, pad_value) - forward_rnn, _ = fluid.layers.rnn( - cell=forward_cell, inputs=input, time_major=False, is_reverse=False) - forward_rnn = fluid.layers.sequence_unpad(x=forward_rnn, length=length) - - input, length = fluid.layers.sequence_pad(input_proj_bn_reverse, pad_value) - reverse_rnn, _ = fluid.layers.rnn( - cell=reverse_cell, - inputs=input, - sequence_length=length, - time_major=False, - is_reverse=True) - reverse_rnn = fluid.layers.sequence_unpad(x=reverse_rnn, length=length) - - out = fluid.layers.concat(input=[forward_rnn, reverse_rnn], axis=1) - return out - - -def bidirectional_gru_bn_layer(name, input, size, act): - """Bidirectonal gru layer with sequence-wise batch normalization. - The batch normalization is only performed on input-state weights. - - :param name: Name of the layer. - :type name: string - :param input: Input layer. - :type input: Variable - :param size: Dimension of GRU cells. - :type size: int - :param act: Activation type. - :type act: string - :return: Bidirectional GRU layer. 
- :rtype: Variable - """ - input_proj_forward = fluid.layers.fc( - input=input, - size=size * 3, - act=None, - param_attr=fluid.ParamAttr(name=name + '_forward_fc_weight'), - bias_attr=False) - input_proj_reverse = fluid.layers.fc( - input=input, - size=size * 3, - act=None, - param_attr=fluid.ParamAttr(name=name + '_reverse_fc_weight'), - bias_attr=False) - #batch norm is only performed on input-related prohections - input_proj_bn_forward = fluid.layers.batch_norm( - input=input_proj_forward, - act=None, - param_attr=fluid.ParamAttr(name=name + '_forward_batch_norm_weight'), - bias_attr=fluid.ParamAttr(name=name + '_forward_batch_norm_bias'), - moving_mean_name=name + '_forward_batch_norm_moving_mean', - moving_variance_name=name + '_forward_batch_norm_moving_variance') - input_proj_bn_reverse = fluid.layers.batch_norm( - input=input_proj_reverse, - act=None, - param_attr=fluid.ParamAttr(name=name + '_reverse_batch_norm_weight'), - bias_attr=fluid.ParamAttr(name=name + '_reverse_batch_norm_bias'), - moving_mean_name=name + '_reverse_batch_norm_moving_mean', - moving_variance_name=name + '_reverse_batch_norm_moving_variance') - #forward and backward in time - forward_gru = fluid.layers.dynamic_gru( - input=input_proj_bn_forward, - size=size, - gate_activation='sigmoid', - candidate_activation=act, - param_attr=fluid.ParamAttr(name=name + '_forward_gru_weight'), - bias_attr=fluid.ParamAttr(name=name + '_forward_gru_bias'), - is_reverse=False) - reverse_gru = fluid.layers.dynamic_gru( - input=input_proj_bn_reverse, - size=size, - gate_activation='sigmoid', - candidate_activation=act, - param_attr=fluid.ParamAttr(name=name + '_reverse_gru_weight'), - bias_attr=fluid.ParamAttr(name=name + '_reverse_gru_bias'), - is_reverse=True) - return fluid.layers.concat(input=[forward_gru, reverse_gru], axis=1) - - -def conv_group(input, num_stacks, seq_len_data, masks): - """Convolution group with stacked convolution layers. - - :param input: Input layer. - :type input: Variable - :param num_stacks: Number of stacked convolution layers. - :type num_stacks: int - :param seq_len_data:Valid sequence length data layer. - :type seq_len_data:Variable - :param masks: Masks data layer to reset padding. - :type masks: Variable - :return: Output layer of the convolution group. - :rtype: Variable - """ - filter_size = (41, 11) - stride = (2, 3) - padding = (20, 5) - conv = conv_bn_layer( - input=input, - filter_size=filter_size, - num_channels_in=1, - num_channels_out=32, - stride=stride, - padding=padding, - act="brelu", - masks=masks, - name='layer_0', ) - - seq_len_data = (np.array(seq_len_data) - filter_size[1] + 2 * padding[1] - ) // stride[1] + 1 - - output_height = (161 - 1) // 2 + 1 - - for i in range(num_stacks - 1): - #reshape masks - output_height = (output_height - 1) // 2 + 1 - masks = fluid.layers.slice( - masks, axes=[2], starts=[0], ends=[output_height]) - conv = conv_bn_layer( - input=conv, - filter_size=(21, 11), - num_channels_in=32, - num_channels_out=32, - stride=(2, 1), - padding=(10, 5), - act="brelu", - masks=masks, - name='layer_{}'.format(i + 1), ) - - output_num_channels = 32 - return conv, output_num_channels, output_height, seq_len_data - - -def rnn_group(input, size, num_stacks, num_conv_layers, use_gru, - share_rnn_weights): - """RNN group with stacked bidirectional simple RNN or GRU layers. - - :param input: Input layer. - :type input: Variable - :param size: Dimension of RNN cells in each layer. - :type size: int - :param num_stacks: Number of stacked rnn layers. 
- :type num_stacks: int - :param use_gru: Use gru if set True. Use simple rnn if set False. - :type use_gru: bool - :param share_rnn_weights: Whether to share input-hidden weights between - forward and backward directional RNNs. - It is only available when use_gru=False. - :type share_weights: bool - :return: Output layer of the RNN group. - :rtype: Variable - """ - output = input - for i in range(num_stacks): - if use_gru: - output = bidirectional_gru_bn_layer( - name='layer_{}'.format(i + num_conv_layers), - input=output, - size=size, - act="relu") - else: - name = 'layer_{}'.format(i + num_conv_layers) - output = bidirectional_simple_rnn_bn_layer( - name=name, - input=output, - size=size, - share_weights=share_rnn_weights) - return output - - -def deep_speech_v2_network(audio_data, - text_data, - seq_len_data, - masks, - dict_size, - num_conv_layers=2, - num_rnn_layers=3, - rnn_size=256, - use_gru=False, - share_rnn_weights=True): - """The DeepSpeech2 network structure. - - :param audio_data: Audio spectrogram data layer. - :type audio_data: Variable - :param text_data: Transcription text data layer. - :type text_data: Variable - :param seq_len_data: Valid sequence length data layer. - :type seq_len_data: Variable - :param masks: Masks data layer to reset padding. - :type masks: Variable - :param dict_size: Dictionary size for tokenized transcription. - :type dict_size: int - :param num_conv_layers: Number of stacking convolution layers. - :type num_conv_layers: int - :param num_rnn_layers: Number of stacking RNN layers. - :type num_rnn_layers: int - :param rnn_size: RNN layer size (dimension of RNN cells). - :type rnn_size: int - :param use_gru: Use gru if set True. Use simple rnn if set False. - :type use_gru: bool - :param share_rnn_weights: Whether to share input-hidden weights between - forward and backward direction RNNs. - It is only available when use_gru=False. - :type share_weights: bool - :return: A tuple of an output unnormalized log probability layer ( - before softmax) and a ctc cost layer. 
- :rtype: tuple of LayerOutput - """ - audio_data = fluid.layers.unsqueeze(audio_data, axes=[1]) - - # convolution group - conv_group_output, conv_group_num_channels, conv_group_height, seq_len_data = conv_group( - input=audio_data, - num_stacks=num_conv_layers, - seq_len_data=seq_len_data, - masks=masks) - - # convert data form convolution feature map to sequence of vectors - transpose = fluid.layers.transpose(conv_group_output, perm=[0, 3, 1, 2]) - reshape_conv_output = fluid.layers.reshape( - x=transpose, - shape=[0, -1, conv_group_height * conv_group_num_channels], - inplace=False) - # remove padding part - seq_len_data = fluid.layers.reshape(seq_len_data, [-1]) - sequence = fluid.layers.sequence_unpad( - x=reshape_conv_output, length=seq_len_data) - #rnn group - rnn_group_output = rnn_group( - input=sequence, - size=rnn_size, - num_stacks=num_rnn_layers, - num_conv_layers=num_conv_layers, - use_gru=use_gru, - share_rnn_weights=share_rnn_weights) - fc = fluid.layers.fc( - input=rnn_group_output, - size=dict_size + 1, - act=None, - param_attr=fluid.ParamAttr( - name='layer_{}'.format(num_conv_layers + num_rnn_layers) + - '_fc_weight'), - bias_attr=fluid.ParamAttr( - name='layer_{}'.format(num_conv_layers + num_rnn_layers) + - '_fc_bias')) - # pribability distribution with softmax - log_probs = fluid.layers.softmax(fc) - log_probs.persistable = True - if not text_data: - return log_probs, None - else: - #ctc cost - ctc_loss = fluid.layers.warpctc( - input=fc, label=text_data, blank=dict_size, norm_by_times=True) - ctc_loss = fluid.layers.reduce_sum(ctc_loss) - return log_probs, ctc_loss diff --git a/models/baidu_en8k/download_model.sh b/models/baidu_en8k/download_model.sh deleted file mode 100644 index bbdb32b61..000000000 --- a/models/baidu_en8k/download_model.sh +++ /dev/null @@ -1,19 +0,0 @@ -#! /usr/bin/env bash - -. ../../utils/utility.sh - -URL='https://deepspeech.bj.bcebos.com/demo_models/baidu_en8k_model_fluid.tar.gz' -MD5=7e58fbf64aa4ecf639b049792ddcf788 -TARGET=./baidu_en8k_model_fluid.tar.gz - - -echo "Download BaiduEn8k model ..." -download $URL $MD5 $TARGET -if [ $? -ne 0 ]; then - echo "Fail to download BaiduEn8k model!" - exit 1 -fi -tar -zxvf $TARGET - - -exit 0 diff --git a/requirements.txt b/requirements.txt index 8c57208a6..8ab09f626 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,6 @@ scipy==1.2.1 -resampy==0.1.5 +resampy==0.2.2 SoundFile==0.9.0.post1 python_speech_features +tensorboardX +yacs diff --git a/setup.sh b/setup.sh index 8d471cbf6..e2c38af77 100644 --- a/setup.sh +++ b/setup.sh @@ -1,5 +1,14 @@ #! /usr/bin/env bash +SUDO='sudo' +if [ $(id -u) -eq 0 ]; then + SUDO='' +fi + +if [ -e /etc/lsb-release ];then + ${SUDO} apt-get install -y pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev +fi + # install python dependencies if [ -f "requirements.txt" ]; then pip3 install -r requirements.txt @@ -29,7 +38,7 @@ fi # install decoders python3 -c "import pkg_resources; pkg_resources.require(\"swig_decoders==1.1\")" if [ $? != 0 ]; then - cd decoders/swig > /dev/null + cd deepspeech/decoders/swig > /dev/null sh setup.sh cd - > /dev/null fi diff --git a/test.py b/test.py deleted file mode 100644 index d3b601e98..000000000 --- a/test.py +++ /dev/null @@ -1,159 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Evaluation for DeepSpeech2 model.""" - -import argparse -import functools -import paddle.fluid as fluid -from data_utils.data import DataGenerator -from model_utils.model import DeepSpeech2Model -from model_utils.model_check import check_cuda, check_version -from utils.error_rate import char_errors, word_errors -from utils.utility import add_arguments, print_arguments - -parser = argparse.ArgumentParser(description=__doc__) -add_arg = functools.partial(add_arguments, argparser=parser) -# yapf: disable -add_arg('batch_size', int, 128, "Minibatch size.") -add_arg('beam_size', int, 500, "Beam search width.") -add_arg('num_proc_bsearch', int, 8, "# of CPUs for beam search.") -add_arg('num_conv_layers', int, 2, "# of convolution layers.") -add_arg('num_rnn_layers', int, 3, "# of recurrent layers.") -add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.") -add_arg('alpha', float, 2.5, "Coef of LM for beam search.") -add_arg('beta', float, 0.3, "Coef of WC for beam search.") -add_arg('cutoff_prob', float, 1.0, "Cutoff probability for pruning.") -add_arg('cutoff_top_n', int, 40, "Cutoff number for pruning.") -add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.") -add_arg('use_gpu', bool, True, "Use GPU or not.") -add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " - "bi-directional RNNs. Not for GRU.") -add_arg('test_manifest', str, - 'data/librispeech/manifest.test-clean', - "Filepath of manifest to evaluate.") -add_arg('mean_std_path', str, - 'data/librispeech/mean_std.npz', - "Filepath of normalizer's mean & std.") -add_arg('vocab_path', str, - 'data/librispeech/vocab.txt', - "Filepath of vocabulary.") -add_arg('model_path', str, - './checkpoints/libri/step_final', - "If None, the training starts from scratch, " - "otherwise, it resumes from the pre-trained model.") -add_arg('lang_model_path', str, - 'models/lm/common_crawl_00.prune01111.trie.klm', - "Filepath for language model.") -add_arg('decoding_method', str, - 'ctc_beam_search', - "Decoding method. Options: ctc_beam_search, ctc_greedy", - choices = ['ctc_beam_search', 'ctc_greedy']) -add_arg('error_rate_type', str, - 'wer', - "Error rate type for evaluation.", - choices=['wer', 'cer']) -add_arg('specgram_type', str, - 'linear', - "Audio feature type. 
Options: linear, mfcc.", - choices=['linear', 'mfcc']) -# yapf: disable -args = parser.parse_args() - - -def evaluate(): - """Evaluate on whole test data for DeepSpeech2.""" - - # check if set use_gpu=True in paddlepaddle cpu version - check_cuda(args.use_gpu) - # check if paddlepaddle version is satisfied - check_version() - - if args.use_gpu: - place = fluid.CUDAPlace(0) - else: - place = fluid.CPUPlace() - - data_generator = DataGenerator( - vocab_filepath=args.vocab_path, - mean_std_filepath=args.mean_std_path, - augmentation_config='{}', - specgram_type=args.specgram_type, - keep_transcription_text=True, - place = place, - is_training = False) - batch_reader = data_generator.batch_reader_creator( - manifest_path=args.test_manifest, - batch_size=args.batch_size, - sortagrad=False, - shuffle_method=None) - - ds2_model = DeepSpeech2Model( - vocab_size=data_generator.vocab_size, - num_conv_layers=args.num_conv_layers, - num_rnn_layers=args.num_rnn_layers, - rnn_layer_size=args.rnn_layer_size, - use_gru=args.use_gru, - share_rnn_weights=args.share_rnn_weights, - place=place, - init_from_pretrained_model=args.model_path) - - # decoders only accept string encoded in utf-8 - vocab_list = [chars for chars in data_generator.vocab_list] - - if args.decoding_method == "ctc_beam_search": - ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path, - vocab_list) - errors_func = char_errors if args.error_rate_type == 'cer' else word_errors - errors_sum, len_refs, num_ins = 0.0, 0, 0 - ds2_model.logger.info("start evaluation ...") - for infer_data in batch_reader(): - probs_split = ds2_model.infer_batch_probs( - infer_data=infer_data, - feeding_dict=data_generator.feeding) - - if args.decoding_method == "ctc_greedy": - result_transcripts = ds2_model.decode_batch_greedy( - probs_split=probs_split, - vocab_list=vocab_list) - else: - result_transcripts = ds2_model.decode_batch_beam_search( - probs_split=probs_split, - beam_alpha=args.alpha, - beam_beta=args.beta, - beam_size=args.beam_size, - cutoff_prob=args.cutoff_prob, - cutoff_top_n=args.cutoff_top_n, - vocab_list=vocab_list, - num_processes=args.num_proc_bsearch) - target_transcripts = infer_data[1] - - for target, result in zip(target_transcripts, result_transcripts): - errors, len_ref = errors_func(target, result) - errors_sum += errors - len_refs += len_ref - num_ins += 1 - print("Error rate [%s] (%d/?) = %f" % - (args.error_rate_type, num_ins, errors_sum / len_refs)) - print("Final error rate [%s] (%d/%d) = %f" % - (args.error_rate_type, num_ins, num_ins, errors_sum / len_refs)) - - ds2_model.logger.info("finish evaluation") - -def main(): - print_arguments(args) - evaluate() - - -if __name__ == '__main__': - main() diff --git a/tests/network_test.py b/tests/network_test.py new file mode 100644 index 000000000..7e8d62c2b --- /dev/null +++ b/tests/network_test.py @@ -0,0 +1,105 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +import numpy as np + +from deepspeech.models.network import DeepSpeech2 + +if __name__ == '__main__': + + batch_size = 2 + feat_dim = 161 + max_len = 100 + audio = np.random.randn(batch_size, feat_dim, max_len) + audio_len = np.random.randint(100, size=batch_size, dtype='int32') + audio_len[-1] = 100 + text = np.array([[1, 2], [1, 2]], dtype='int32') + text_len = np.array([2] * batch_size, dtype='int32') + + place = paddle.CUDAPlace(0) + audio = paddle.to_tensor( + audio, dtype='float32', place=place, stop_gradient=True) + audio_len = paddle.to_tensor( + audio_len, dtype='int64', place=place, stop_gradient=True) + text = paddle.to_tensor( + text, dtype='int32', place=place, stop_gradient=True) + text_len = paddle.to_tensor( + text_len, dtype='int64', place=place, stop_gradient=True) + + print(audio.shape) + print(audio_len.shape) + print(text.shape) + print(text_len.shape) + print("-----------------") + + model = DeepSpeech2( + feat_size=feat_dim, + dict_size=10, + num_conv_layers=2, + num_rnn_layers=3, + rnn_size=1024, + use_gru=False, + share_rnn_weights=False, ) + logits, probs, logits_len = model(audio, text, audio_len, text_len) + print('probs.shape', probs.shape) + print("-----------------") + + model2 = DeepSpeech2( + feat_size=feat_dim, + dict_size=10, + num_conv_layers=2, + num_rnn_layers=3, + rnn_size=1024, + use_gru=True, + share_rnn_weights=False, ) + logits, probs, logits_len = model2(audio, text, audio_len, text_len) + print('probs.shape', probs.shape) + print("-----------------") + + model3 = DeepSpeech2( + feat_size=feat_dim, + dict_size=10, + num_conv_layers=2, + num_rnn_layers=3, + rnn_size=1024, + use_gru=False, + share_rnn_weights=True, ) + logits, probs, logits_len = model3(audio, text, audio_len, text_len) + print('probs.shape', probs.shape) + print("-----------------") + + model4 = DeepSpeech2( + feat_size=feat_dim, + dict_size=10, + num_conv_layers=2, + num_rnn_layers=3, + rnn_size=1024, + use_gru=True, + share_rnn_weights=True, ) + logits, probs, logits_len = model4(audio, text, audio_len, text_len) + print('probs.shape', probs.shape) + print("-----------------") + + model5 = DeepSpeech2( + feat_size=feat_dim, + dict_size=10, + num_conv_layers=2, + num_rnn_layers=3, + rnn_size=1024, + use_gru=False, + share_rnn_weights=False, ) + logits, probs, logits_len = model5(audio, text, audio_len, text_len) + print('probs.shape', probs.shape) + print("-----------------") diff --git a/utils/tests/test_error_rate.py b/tests/test_error_rate.py similarity index 99% rename from utils/tests/test_error_rate.py rename to tests/test_error_rate.py index 80c5b192a..646d5739f 100644 --- a/utils/tests/test_error_rate.py +++ b/tests/test_error_rate.py @@ -14,7 +14,7 @@ """Test error rate.""" import unittest -from utils import error_rate +from deepspeech.utils import error_rate class TestParse(unittest.TestCase): diff --git a/tools/Makefile b/tools/Makefile new file mode 100644 index 000000000..ef721c2b8 --- /dev/null +++ b/tools/Makefile @@ -0,0 +1,13 @@ +PYTHON:= python3.7 +.PHONY: all clean + +all: virtualenv + +virtualenv: + test -d venv || virtualenv -p $(PYTHON) venv + touch venv/bin/activate + +clean: + rm -fr venv + find -iname "*.pyc" -delete + diff --git a/tools/tune.py b/tools/tune.py deleted file mode 100644 index 36443e28b..000000000 --- a/tools/tune.py +++ /dev/null @@ -1,201 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Beam search parameters tuning for DeepSpeech2 model."""
-
-import sys
-import os
-import numpy as np
-import argparse
-import functools
-import gzip
-import logging
-import paddle.fluid as fluid
-import _init_paths
-from data_utils.data import DataGenerator
-from model_utils.model import DeepSpeech2Model
-from utils.error_rate import char_errors, word_errors
-from utils.utility import add_arguments, print_arguments
-
-parser = argparse.ArgumentParser(description=__doc__)
-add_arg = functools.partial(add_arguments, argparser=parser)
-# yapf: disable
-add_arg('num_batches', int, -1, "# of batches tuning on. "
-        "Default -1, on whole dev set.")
-add_arg('batch_size', int, 256, "# of samples per batch.")
-add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).")
-add_arg('beam_size', int, 500, "Beam search width.")
-add_arg('num_proc_bsearch', int, 8, "# of CPUs for beam search.")
-add_arg('num_conv_layers', int, 2, "# of convolution layers.")
-add_arg('num_rnn_layers', int, 3, "# of recurrent layers.")
-add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.")
-add_arg('num_alphas', int, 45, "# of alpha candidates for tuning.")
-add_arg('num_betas', int, 8, "# of beta candidates for tuning.")
-add_arg('alpha_from', float, 1.0, "Where alpha starts tuning from.")
-add_arg('alpha_to', float, 3.2, "Where alpha ends tuning with.")
-add_arg('beta_from', float, 0.1, "Where beta starts tuning from.")
-add_arg('beta_to', float, 0.45, "Where beta ends tuning with.")
-add_arg('cutoff_prob', float, 1.0, "Cutoff probability for pruning.")
-add_arg('cutoff_top_n', int, 40, "Cutoff number for pruning.")
-add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.")
-add_arg('use_gpu', bool, True, "Use GPU or not.")
-add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across "
-        "bi-directional RNNs. Not for GRU.")
-add_arg('tune_manifest', str,
-        'data/librispeech/manifest.dev-clean',
-        "Filepath of manifest to tune.")
-add_arg('mean_std_path', str,
-        'data/librispeech/mean_std.npz',
-        "Filepath of normalizer's mean & std.")
-add_arg('vocab_path', str,
-        'data/librispeech/vocab.txt',
-        "Filepath of vocabulary.")
-add_arg('lang_model_path', str,
-        'models/lm/common_crawl_00.prune01111.trie.klm',
-        "Filepath for language model.")
-add_arg('model_path', str,
-        './checkpoints/libri/params.latest.tar.gz',
-        "If None, the training starts from scratch, "
-        "otherwise, it resumes from the pre-trained model.")
-add_arg('error_rate_type', str,
-        'wer',
-        "Error rate type for evaluation.",
-        choices=['wer', 'cer'])
-add_arg('specgram_type', str,
-        'linear',
-        "Audio feature type. Options: linear, mfcc.",
-        choices=['linear', 'mfcc'])
-# yapf: disable
-args = parser.parse_args()
-
-
-def tune():
-    """Tune parameters alpha and beta incrementally."""
-    if not args.num_alphas >= 0:
-        raise ValueError("num_alphas must be non-negative!")
-    if not args.num_betas >= 0:
-        raise ValueError("num_betas must be non-negative!")
-
-    if args.use_gpu:
-        place = fluid.CUDAPlace(0)
-    else:
-        place = fluid.CPUPlace()
-
-    data_generator = DataGenerator(
-        vocab_filepath=args.vocab_path,
-        mean_std_filepath=args.mean_std_path,
-        augmentation_config='{}',
-        specgram_type=args.specgram_type,
-        keep_transcription_text=True,
-        place = place,
-        is_training = False)
-
-    batch_reader = data_generator.batch_reader_creator(
-        manifest_path=args.tune_manifest,
-        batch_size=args.batch_size,
-        sortagrad=False,
-        shuffle_method=None)
-
-    ds2_model = DeepSpeech2Model(
-        vocab_size=data_generator.vocab_size,
-        num_conv_layers=args.num_conv_layers,
-        num_rnn_layers=args.num_rnn_layers,
-        rnn_layer_size=args.rnn_layer_size,
-        use_gru=args.use_gru,
-        place=place,
-        init_from_pretrained_model=args.model_path,
-        share_rnn_weights=args.share_rnn_weights)
-
-    # decoders only accept string encoded in utf-8
-    vocab_list = [chars for chars in data_generator.vocab_list]
-    errors_func = char_errors if args.error_rate_type == 'cer' else word_errors
-    # create grid for search
-    cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas)
-    cand_betas = np.linspace(args.beta_from, args.beta_to, args.num_betas)
-    params_grid = [(alpha, beta) for alpha in cand_alphas
-                   for beta in cand_betas]
-
-    err_sum = [0.0 for i in range(len(params_grid))]
-    err_ave = [0.0 for i in range(len(params_grid))]
-    num_ins, len_refs, cur_batch = 0, 0, 0
-    # initialize external scorer
-    ds2_model.init_ext_scorer(args.alpha_from, args.beta_from,
-                              args.lang_model_path, vocab_list)
-    ## incremental tuning parameters over multiple batches
-    ds2_model.logger.info("start tuning ...")
-    for infer_data in batch_reader():
-        if (args.num_batches >= 0) and (cur_batch >= args.num_batches):
-            break
-        probs_split = ds2_model.infer_batch_probs(
-            infer_data=infer_data,
-            feeding_dict=data_generator.feeding)
-        target_transcripts = infer_data[1]
-
-        num_ins += len(target_transcripts)
-        # grid search
-        for index, (alpha, beta) in enumerate(params_grid):
-            result_transcripts = ds2_model.decode_batch_beam_search(
-                probs_split=probs_split,
-                beam_alpha=alpha,
-                beam_beta=beta,
-                beam_size=args.beam_size,
-                cutoff_prob=args.cutoff_prob,
-                cutoff_top_n=args.cutoff_top_n,
-                vocab_list=vocab_list,
-                num_processes=args.num_proc_bsearch)
-            for target, result in zip(target_transcripts, result_transcripts):
-                errors, len_ref = errors_func(target, result)
-                err_sum[index] += errors
-                # accumulate the length of references of every batch
-                # in the first iteration
-                if args.alpha_from == alpha and args.beta_from == beta:
-                    len_refs += len_ref
-
-            err_ave[index] = err_sum[index] / len_refs
-            if index % 2 == 0:
-                sys.stdout.write('.')
-                sys.stdout.flush()
-
-        # output on-line tuning result at the end of current batch
-        err_ave_min = min(err_ave)
-        min_index = err_ave.index(err_ave_min)
-        print("\nBatch %d [%d/?], current opt (alpha, beta) = (%s, %s), "
-              " min [%s] = %f" %(cur_batch, num_ins,
-              "%.3f" % params_grid[min_index][0],
-              "%.3f" % params_grid[min_index][1],
-              args.error_rate_type, err_ave_min))
-        cur_batch += 1
-
-    # output WER/CER at every (alpha, beta)
-    print("\nFinal %s:\n" % args.error_rate_type)
-    for index in range(len(params_grid)):
-        print("(alpha, beta) = (%s, %s), [%s] = %f"
-              % ("%.3f" % params_grid[index][0], "%.3f" % params_grid[index][1],
-                 args.error_rate_type, err_ave[index]))
-
-    err_ave_min = min(err_ave)
-    min_index = err_ave.index(err_ave_min)
-    print("\nFinish tuning on %d batches, final opt (alpha, beta) = (%s, %s)"
-          % (cur_batch, "%.3f" % params_grid[min_index][0],
-             "%.3f" % params_grid[min_index][1]))
-
-    ds2_model.logger.info("finish tuning")
-
-
-def main():
-    print_arguments(args)
-    tune()
-
-
-if __name__ == '__main__':
-    main()
-add_arg('init_from_pretrained_model',str,
-        None,
-        "If None, the training starts from scratch, "
-        "otherwise, it resumes from the pre-trained model.")
-
-add_arg('train_manifest', str,
-        'data/librispeech/manifest.train',
-        "Filepath of train manifest.")
-add_arg('dev_manifest', str,
-        'data/librispeech/manifest.dev-clean',
-        "Filepath of validation manifest.")
-add_arg('mean_std_path', str,
-        'data/librispeech/mean_std.npz',
-        "Filepath of normalizer's mean & std.")
-add_arg('vocab_path', str,
-        'data/librispeech/vocab.txt',
-        "Filepath of vocabulary.")
-add_arg('output_model_dir', str,
-        "./checkpoints/libri",
-        "Directory for saving checkpoints.")
-add_arg('augment_conf_path',str,
-        'conf/augmentation.config',
-        "Filepath of augmentation configuration file (json-format).")
-add_arg('specgram_type', str,
-        'linear',
-        "Audio feature type. Options: linear, mfcc.",
-        choices=['linear', 'mfcc'])
-add_arg('shuffle_method', str,
-        'batch_shuffle_clipped',
-        "Shuffle method.",
-        choices=['instance_shuffle', 'batch_shuffle', 'batch_shuffle_clipped'])
-# yapf: disable
-args = parser.parse_args()
-
-
-def train():
-    """DeepSpeech2 training."""
-
-    # check if set use_gpu=True in paddlepaddle cpu version
-    check_cuda(args.use_gpu)
-    # check if paddlepaddle version is satisfied
-    check_version()
-
-    if args.use_gpu:
-        place = fluid.CUDAPlace(0)
-    else:
-        place = fluid.CPUPlace()
-
-    train_generator = DataGenerator(
-        vocab_filepath=args.vocab_path,
-        mean_std_filepath=args.mean_std_path,
-        augmentation_config=io.open(args.augment_conf_path, mode='r', encoding='utf8').read(),
-        max_duration=args.max_duration,
-        min_duration=args.min_duration,
-        specgram_type=args.specgram_type,
-        place=place)
-    dev_generator = DataGenerator(
-        vocab_filepath=args.vocab_path,
-        mean_std_filepath=args.mean_std_path,
-        augmentation_config="{}",
-        specgram_type=args.specgram_type,
-        place = place)
-    train_batch_reader = train_generator.batch_reader_creator(
-        manifest_path=args.train_manifest,
-        batch_size=args.batch_size,
-        sortagrad=args.use_sortagrad if args.init_from_pretrained_model is None else False,
-        shuffle_method=args.shuffle_method)
-    dev_batch_reader = dev_generator.batch_reader_creator(
-        manifest_path=args.dev_manifest,
-        batch_size=args.batch_size,
-        sortagrad=False,
-        shuffle_method=None)
-
-    ds2_model = DeepSpeech2Model(
-        vocab_size=train_generator.vocab_size,
-        num_conv_layers=args.num_conv_layers,
-        num_rnn_layers=args.num_rnn_layers,
-        rnn_layer_size=args.rnn_layer_size,
-        use_gru=args.use_gru,
-        share_rnn_weights=args.share_rnn_weights,
-        place=place,
-        init_from_pretrained_model=args.init_from_pretrained_model,
-        output_model_dir=args.output_model_dir)
-
-    ds2_model.train(
-        train_batch_reader=train_batch_reader,
-        dev_batch_reader=dev_batch_reader,
-        feeding_dict=train_generator.feeding,
-        learning_rate=args.learning_rate,
-        gradient_clipping=400,
-        batch_size=args.batch_size,
-        num_samples=args.num_samples,
-        num_epoch=args.num_epoch,
-        save_epoch=args.save_epoch,
-        num_iterations_print=args.num_iter_print,
-        test_off=args.test_off)
-
-
-def main():
-    print_arguments(args)
-    train()
-
-
-if __name__ == '__main__':
-    main()
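Both deleted scripts build their command line with the same add_arg = functools.partial(add_arguments, argparser=parser) pattern; the helper itself is removed from utils/utility.py further down in this diff. A minimal sketch combining the helper with the call pattern is below (the option names here are illustrative, not the full tables from train.py or tune.py).

    import argparse
    import distutils.util
    import functools


    def add_arguments(argname, type, default, help, argparser, **kwargs):
        """Register --<argname> on the parser; bools go through strtobool."""
        type = distutils.util.strtobool if type == bool else type
        argparser.add_argument(
            "--" + argname,
            default=default,
            type=type,
            help=help + ' Default: %(default)s.',
            **kwargs)


    parser = argparse.ArgumentParser(description="Illustrative option table")
    add_arg = functools.partial(add_arguments, argparser=parser)
    add_arg('batch_size', int, 256, "Minibatch size.")
    add_arg('use_gpu', bool, True, "Use GPU or not.")
    args = parser.parse_args()
    print(vars(args))
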
"""Build vocabulary from manifest files. - Each item in vocabulary file is a character. """ @@ -22,16 +21,16 @@ import codecs import json from collections import Counter import os.path -import _init_paths -from data_utils.utility import read_manifest -from utils.utility import add_arguments, print_arguments + +from deepspeech.frontend.utility import read_manifest +from deepspeech.utils.utility import add_arguments, print_arguments parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) # yapf: disable add_arg('count_threshold', int, 0, "Truncation threshold for char counts.") add_arg('vocab_path', str, - 'data/librispeech/vocab.txt', + 'examples/librispeech/data/vocab.txt', "Filepath to write the vocabulary.") add_arg('manifest_paths', str, None, @@ -59,6 +58,7 @@ def main(): count_sorted = sorted(counter.items(), key=lambda x: x[1], reverse=True) with codecs.open(args.vocab_path, 'w', 'utf-8') as fout: + fout.write('' + '\n') for char, count in count_sorted: if count < args.count_threshold: break fout.write(char + '\n') diff --git a/tools/compute_mean_std.py b/utils/compute_mean_std.py similarity index 87% rename from tools/compute_mean_std.py rename to utils/compute_mean_std.py index d40739f32..80fe88813 100644 --- a/tools/compute_mean_std.py +++ b/utils/compute_mean_std.py @@ -15,11 +15,10 @@ import argparse import functools -import _init_paths -from data_utils.normalizer import FeatureNormalizer -from data_utils.augmentor.augmentation import AugmentationPipeline -from data_utils.featurizer.audio_featurizer import AudioFeaturizer -from utils.utility import add_arguments, print_arguments +from deepspeech.frontend.normalizer import FeatureNormalizer +from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline +from deepspeech.frontend.featurizer.audio_featurizer import AudioFeaturizer +from deepspeech.utils.utility import add_arguments, print_arguments parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) diff --git a/tools/profile.sh b/utils/profile.sh similarity index 100% rename from tools/profile.sh rename to utils/profile.sh diff --git a/utils/utility.py b/utils/utility.py index cd7166593..b13bc1129 100644 --- a/utils/utility.py +++ b/utils/utility.py @@ -11,47 +11,62 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Contains common utility functions.""" -import distutils.util +import os +import tarfile +import zipfile +from paddle.dataset.common import md5file -def print_arguments(args): - """Print argparse's arguments. +def getfile_insensitive(path): + """Get the actual file path when given insensitive filename.""" + directory, filename = os.path.split(path) + directory, filename = (directory or '.'), filename.lower() + for f in os.listdir(directory): + newpath = os.path.join(directory, f) + if os.path.isfile(newpath) and f.lower() == filename: + return newpath - Usage: - .. code-block:: python +def download_multi(url, target_dir, extra_args): + """Download multiple files from url to target_dir.""" + if not os.path.exists(target_dir): os.makedirs(target_dir) + print("Downloading %s ..." 
diff --git a/tools/compute_mean_std.py b/utils/compute_mean_std.py
similarity index 87%
rename from tools/compute_mean_std.py
rename to utils/compute_mean_std.py
index d40739f32..80fe88813 100644
--- a/tools/compute_mean_std.py
+++ b/utils/compute_mean_std.py
@@ -15,11 +15,10 @@
 import argparse
 import functools
-import _init_paths
-from data_utils.normalizer import FeatureNormalizer
-from data_utils.augmentor.augmentation import AugmentationPipeline
-from data_utils.featurizer.audio_featurizer import AudioFeaturizer
-from utils.utility import add_arguments, print_arguments
+from deepspeech.frontend.normalizer import FeatureNormalizer
+from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
+from deepspeech.frontend.featurizer.audio_featurizer import AudioFeaturizer
+from deepspeech.utils.utility import add_arguments, print_arguments
 
 parser = argparse.ArgumentParser(description=__doc__)
 add_arg = functools.partial(add_arguments, argparser=parser)
diff --git a/tools/profile.sh b/utils/profile.sh
similarity index 100%
rename from tools/profile.sh
rename to utils/profile.sh
diff --git a/utils/utility.py b/utils/utility.py
index cd7166593..b13bc1129 100644
--- a/utils/utility.py
+++ b/utils/utility.py
@@ -11,47 +11,62 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Contains common utility functions."""
-import distutils.util
+import os
+import tarfile
+import zipfile
+from paddle.dataset.common import md5file
 
 
-def print_arguments(args):
-    """Print argparse's arguments.
+def getfile_insensitive(path):
+    """Get the actual file path when given insensitive filename."""
+    directory, filename = os.path.split(path)
+    directory, filename = (directory or '.'), filename.lower()
+    for f in os.listdir(directory):
+        newpath = os.path.join(directory, f)
+        if os.path.isfile(newpath) and f.lower() == filename:
+            return newpath
 
-    Usage:
-    .. code-block:: python
+def download_multi(url, target_dir, extra_args):
+    """Download multiple files from url to target_dir."""
+    if not os.path.exists(target_dir): os.makedirs(target_dir)
+    print("Downloading %s ..." % url)
+    ret_code = os.system("wget -c " + url + ' ' + extra_args + " -P " +
+                         target_dir)
+    return ret_code
 
-        parser = argparse.ArgumentParser()
-        parser.add_argument("name", default="Jonh", type=str, help="User name.")
-        args = parser.parse_args()
-        print_arguments(args)
-    :param args: Input argparse.Namespace for printing.
-    :type args: argparse.Namespace
-    """
-    print("----------- Configuration Arguments -----------")
-    for arg, value in sorted(vars(args).items()):
-        print("%s: %s" % (arg, value))
-    print("------------------------------------------------")
+def download(url, md5sum, target_dir):
+    """Download file from url to target_dir, and check md5sum."""
+    if not os.path.exists(target_dir): os.makedirs(target_dir)
+    filepath = os.path.join(target_dir, url.split("/")[-1])
+    if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
+        print("Downloading %s ..." % url)
+        os.system("wget -c " + url + " -P " + target_dir)
+        print("\nMD5 Chesksum %s ..." % filepath)
+        if not md5file(filepath) == md5sum:
+            raise RuntimeError("MD5 checksum failed.")
+    else:
+        print("File exists, skip downloading. (%s)" % filepath)
+    return filepath
 
-def add_arguments(argname, type, default, help, argparser, **kwargs):
-    """Add argparse's argument.
+def unpack(filepath, target_dir, rm_tar=False):
+    """Unpack the file to the target_dir."""
+    print("Unpacking %s ..." % filepath)
+    tar = tarfile.open(filepath)
+    tar.extractall(target_dir)
+    tar.close()
+    if rm_tar == True:
+        os.remove(filepath)
 
-    Usage:
-    .. code-block:: python
-
-        parser = argparse.ArgumentParser()
-        add_argument("name", str, "Jonh", "User name.", parser)
-        args = parser.parse_args()
-    """
-    type = distutils.util.strtobool if type == bool else type
-    argparser.add_argument(
-        "--" + argname,
-        default=default,
-        type=type,
-        help=help + ' Default: %(default)s.',
-        **kwargs)
+def unzip(filepath, target_dir, rm_tar=False):
+    """Unzip the file to the target_dir."""
+    print("Unpacking %s ..." % filepath)
+    tar = zipfile.ZipFile(filepath, 'r')
+    tar.extractall(target_dir)
+    tar.close()
+    if rm_tar == True:
+        os.remove(filepath)
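The new helpers in utils/utility.py shell out to wget, verify the MD5 checksum, and then extract the archive. A typical call sequence in a dataset-preparation script might look like the sketch below; the URL, checksum, and target directory are placeholders, and the import assumes the script runs from the repository root so that utils/ is on the import path.

    from utils.utility import download, unpack

    # Placeholder URL and MD5 -- substitute the real dataset values.
    URL = "https://example.com/dataset.tar.gz"
    MD5 = "0123456789abcdef0123456789abcdef"

    # download() skips the fetch if the file already exists with a matching MD5.
    filepath = download(URL, MD5, target_dir="dataset/raw")
    # unpack() extracts the tarball; rm_tar=True would delete it afterwards.
    unpack(filepath, target_dir="dataset/raw", rm_tar=False)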