parent 553aa35989
commit a7244593b9
@@ -0,0 +1,511 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "downtown-invalid",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/workspace/DeepSpeech-2.x\n"
]
},
{
"data": {
"text/plain": [
"'/workspace/DeepSpeech-2.x'"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%cd ..\n",
"%pwd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "emerging-meter",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/layers/utils.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" def convert_to_list(value, n, name, dtype=np.int):\n"
]
}
],
"source": [
"import math\n",
"import random\n",
"import tarfile\n",
"import logging\n",
"import numpy as np\n",
"from collections import namedtuple\n",
"from functools import partial\n",
"\n",
"import paddle\n",
"from paddle.io import Dataset\n",
"from paddle.io import DataLoader\n",
"from paddle.io import BatchSampler\n",
"from paddle.io import DistributedBatchSampler\n",
"from paddle import distributed as dist\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "excessive-american",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 3,
"id": "naval-brave",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
" and should_run_async(code)\n",
"WARNING:root:register user softmax to paddle, remove this when fixed!\n",
"WARNING:root:register user log_softmax to paddle, remove this when fixed!\n",
"WARNING:root:register user sigmoid to paddle, remove this when fixed!\n",
"WARNING:root:register user log_sigmoid to paddle, remove this when fixed!\n",
"WARNING:root:register user relu to paddle, remove this when fixed!\n",
"WARNING:root:override cat of paddle if exists or register, remove this when fixed!\n",
"WARNING:root:override item of paddle.Tensor if exists or register, remove this when fixed!\n",
"WARNING:root:override long of paddle.Tensor if exists or register, remove this when fixed!\n",
"WARNING:root:override new_full of paddle.Tensor if exists or register, remove this when fixed!\n",
"WARNING:root:override eq of paddle.Tensor if exists or register, remove this when fixed!\n",
"WARNING:root:override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n",
"WARNING:root:override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n",
"WARNING:root:register user view to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user view_as to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user masked_fill to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user masked_fill_ to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user fill_ to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user repeat to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user softmax to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user sigmoid to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user relu to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user type_as to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user to to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user float to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user glu to paddle.nn.functional, remove this when fixed!\n",
"WARNING:root:override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n",
"WARNING:root:register user Module to paddle.nn, remove this when fixed!\n",
"WARNING:root:register user ModuleList to paddle.nn, remove this when fixed!\n",
"WARNING:root:register user GLU to paddle.nn, remove this when fixed!\n",
"WARNING:root:register user ConstantPad2d to paddle.nn, remove this when fixed!\n",
"WARNING:root:register user export to paddle.jit, remove this when fixed!\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'num_samples': 5, 'beam_size': 500, 'num_proc_bsearch': 8, 'num_conv_layers': 2, 'num_rnn_layers': 3, 'rnn_layer_size': 2048, 'alpha': 2.5, 'beta': 0.3, 'cutoff_prob': 1.0, 'cutoff_top_n': 40, 'use_gru': False, 'use_gpu': True, 'share_rnn_weights': True, 'unit_type': 'char', 'spm_model_prefix': 'examples/tiny/s1/data/spm_bpe', 'infer_manifest': 'examples/tiny/s1/data/manifest.tiny', 'mean_std_path': 'examples/tiny/s1/data/mean_std.npz', 'vocab_path': 'examples/tiny/s1/data/vocab.txt', 'lang_model_path': 'models/lm/common_crawl_00.prune01111.trie.klm', 'model_path': 'examples/tiny/s1/checkpoints/step_final', 'decoding_method': 'ctc_beam_search', 'error_rate_type': 'wer', 'specgram_type': 'fbank', 'feat_dim': 80, 'delta_delta': False}\n"
]
}
],
"source": [
"import sys\n",
"import argparse\n",
"import functools\n",
"from deepspeech.utils.utility import add_arguments, print_arguments\n",
"parser = argparse.ArgumentParser(description=__doc__)\n",
"add_arg = functools.partial(add_arguments, argparser=parser)\n",
"# yapf: disable\n",
"add_arg('num_samples', int, 5, \"# of samples to infer.\")\n",
"add_arg('beam_size', int, 500, \"Beam search width.\")\n",
"add_arg('num_proc_bsearch', int, 8, \"# of CPUs for beam search.\")\n",
"add_arg('num_conv_layers', int, 2, \"# of convolution layers.\")\n",
"add_arg('num_rnn_layers', int, 3, \"# of recurrent layers.\")\n",
"add_arg('rnn_layer_size', int, 2048, \"# of recurrent cells per layer.\")\n",
"add_arg('alpha', float, 2.5, \"Coef of LM for beam search.\")\n",
"add_arg('beta', float, 0.3, \"Coef of WC for beam search.\")\n",
"add_arg('cutoff_prob', float, 1.0, \"Cutoff probability for pruning.\")\n",
"add_arg('cutoff_top_n', int, 40, \"Cutoff number for pruning.\")\n",
"add_arg('use_gru', bool, False, \"Use GRUs instead of simple RNNs.\")\n",
"add_arg('use_gpu', bool, True, \"Use GPU or not.\")\n",
"add_arg('share_rnn_weights',bool, True, \"Share input-hidden weights across \"\n",
" \"bi-directional RNNs. Not for GRU.\")\n",
"add_arg('unit_type', str,\n",
" 'char',\n",
" \"Options: char, word, spm.\",\n",
" choices=['char', 'word', 'spm'])\n",
"add_arg('spm_model_prefix', str,\n",
" 'examples/tiny/s1/data/spm_bpe',\n",
" \"spm model prefix.\",)\n",
"add_arg('infer_manifest', str,\n",
" 'examples/tiny/s1/data/manifest.tiny',\n",
" \"Filepath of manifest to infer.\")\n",
"add_arg('mean_std_path', str,\n",
" 'examples/tiny/s1/data/mean_std.npz',\n",
" \"Filepath of normalizer's mean & std.\")\n",
"add_arg('vocab_path', str,\n",
" 'examples/tiny/s1/data/vocab.txt',\n",
" \"Filepath of vocabulary.\")\n",
"add_arg('lang_model_path', str,\n",
" 'models/lm/common_crawl_00.prune01111.trie.klm',\n",
" \"Filepath for language model.\")\n",
"add_arg('model_path', str,\n",
" 'examples/tiny/s1/checkpoints/step_final',\n",
" \"If None, the training starts from scratch, \"\n",
" \"otherwise, it resumes from the pre-trained model.\")\n",
"add_arg('decoding_method', str,\n",
" 'ctc_beam_search',\n",
" \"Decoding method. Options: ctc_beam_search, ctc_greedy\",\n",
" choices = ['ctc_beam_search', 'ctc_greedy'])\n",
"add_arg('error_rate_type', str,\n",
" 'wer',\n",
" \"Error rate type for evaluation.\",\n",
" choices=['wer', 'cer'])\n",
"add_arg('specgram_type', str,\n",
" 'fbank',\n",
" \"Audio feature type. Options: linear, mfcc.\",\n",
" choices=['linear', 'mfcc'])\n",
"add_arg('feat_dim', int, 80, \"mfcc or fbank feat dim.\")\n",
"add_arg('delta_delta', bool, False, \"delta delta\")\n",
"# yapf: disable\n",
"args = parser.parse_args([])\n",
"print(vars(args))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "bearing-physics",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/scipy/fftpack/__init__.py:103: DeprecationWarning: The module numpy.dual is deprecated. Instead of using dual, use the functions directly from numpy or scipy.\n",
" from numpy.dual import register_func\n",
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/scipy/special/orthogonal.py:81: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" from numpy import (exp, inf, pi, sqrt, floor, sin, cos, around, int,\n",
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/numba/core/types/__init__.py:108: DeprecationWarning: `np.long` is a deprecated alias for `np.compat.long`. To silence this warning, use `np.compat.long` by itself. In the likely event your code does not need to work on Python 2 you can use the builtin `int` for which `np.compat.long` is itself an alias. Doing this will not modify any behaviour and is safe. When replacing `np.long`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" long_ = _make_signed(np.long)\n",
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/numba/core/types/__init__.py:109: DeprecationWarning: `np.long` is a deprecated alias for `np.compat.long`. To silence this warning, use `np.compat.long` by itself. In the likely event your code does not need to work on Python 2 you can use the builtin `int` for which `np.compat.long` is itself an alias. Doing this will not modify any behaviour and is safe. When replacing `np.long`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" ulong = _make_unsigned(np.long)\n"
]
}
],
"source": [
"# batch_reader = create_dataloader(\n",
"# manifest_path=args.infer_manifest,\n",
"# vocab_filepath=args.vocab_path,\n",
"# mean_std_filepath=args.mean_std_path,\n",
"# augmentation_config='{}',\n",
"# #max_duration=float('inf'),\n",
"# max_duration=27.0,\n",
"# min_duration=0.0,\n",
"# stride_ms=10.0,\n",
"# window_ms=20.0,\n",
"# max_freq=None,\n",
"# specgram_type=args.specgram_type,\n",
"# use_dB_normalization=True,\n",
"# random_seed=0,\n",
"# keep_transcription_text=True,\n",
"# is_training=False,\n",
"# batch_size=args.num_samples,\n",
"# sortagrad=True,\n",
"# shuffle_method=None,\n",
"# dist=False)\n",
"\n",
"from deepspeech.frontend.utility import read_manifest\n",
"from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline\n",
"from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer\n",
"from deepspeech.frontend.speech import SpeechSegment\n",
"from deepspeech.frontend.normalizer import FeatureNormalizer\n",
"\n",
"\n",
"from deepspeech.io.collator import SpeechCollator\n",
"from deepspeech.io.dataset import ManifestDataset\n",
"from deepspeech.io.sampler import (\n",
" SortagradDistributedBatchSampler,\n",
" SortagradBatchSampler,\n",
")\n",
"from deepspeech.io import create_dataloader\n",
"batch_reader = create_dataloader(\n",
" manifest_path=args.infer_manifest,\n",
" unit_type=args.unit_type,\n",
" vocab_filepath=args.vocab_path,\n",
" mean_std_filepath=args.mean_std_path,\n",
" spm_model_prefix=args.spm_model_prefix,\n",
" augmentation_config='{}',\n",
" max_input_len=27.0,\n",
" min_input_len=0.0,\n",
" max_output_len=float('inf'),\n",
" min_output_len=0.0,\n",
" max_output_input_ratio=float('inf'),\n",
" min_output_input_ratio=0.0,\n",
" stride_ms=10.0,\n",
" window_ms=20.0,\n",
" max_freq=None,\n",
" specgram_type=args.specgram_type,\n",
" feat_dim=args.feat_dim,\n",
" delta_delta=args.delta_delta,\n",
" use_dB_normalization=True,\n",
" random_seed=0,\n",
" keep_transcription_text=True,\n",
" is_training=False,\n",
" batch_size=args.num_samples,\n",
" num_workers=0,\n",
" sortagrad=True,\n",
" shuffle_method=None,\n",
" dist=False)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "classified-melissa",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
" and should_run_async(code)\n",
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dataloader/dataloader_iter.py:354: DeprecationWarning: `np.object` is a deprecated alias for the builtin `object`. To silence this warning, use `object` by itself. Doing this will not modify any behavior and is safe. \n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" if arr.dtype == np.object:\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"test: Tensor(shape=[5, 23], dtype=int32, place=CUDAPinnedPlace, stop_gradient=True,\n",
" [[116, 104, 101, 32, 116, 119, 101, 110, 116, 105, 101, 115, -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 ],\n",
" [119, 104, 101, 114, 101, 32, 105, 115, 32, 116, 104, 97, 116, -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 ],\n",
" [116, 101, 110, 32, 115, 101, 99, 111, 110, 100, 115, -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 ],\n",
" [104, 101, 32, 100, 111, 101, 115, 110, 39, 116, 32, 119, 111, 114, 107, 32, 97, 116, 32, 97, 108, 108, -1 ],\n",
" [119, 104, 101, 114, 101, 32, 105, 115, 32, 109, 121, 32, 98, 114, 111, 116, 104, 101, 114, 32, 110, 111, 119]])\n",
"test raw: the twenties\n",
"test raw: where is my brother now\n",
"audio len: Tensor(shape=[5], dtype=int64, place=CUDAPinnedPlace, stop_gradient=True,\n",
" [163, 173, 184, 190, 203])\n",
"test len: Tensor(shape=[5], dtype=int64, place=CUDAPlace(0), stop_gradient=True,\n",
" [12, 13, 11, 22, 23])\n",
"audio: Tensor(shape=[5, 203, 80], dtype=float32, place=CUDAPinnedPlace, stop_gradient=True,\n",
" [[[-51.32406616, -17.91388321, 0.00000000 , ..., -26.66350746, -27.46039391, -27.22303963],\n",
" [-15.19027233, -20.52460480, 0.00000000 , ..., -28.47811317, -26.87953568, -25.13592339],\n",
" [-22.80181694, -19.48889351, 0.00000000 , ..., -29.96320724, -25.96619034, -24.57164192],\n",
" ...,\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n",
"\n",
" [[-15.38297653, -18.95307732, 0.00000000 , ..., -15.22777271, -16.46900940, -12.32327461],\n",
" [-14.06289291, -12.69954872, 0.00000000 , ..., -15.68012810, -16.92030334, -13.49134445],\n",
" [-19.78544235, -11.63046265, 0.00000000 , ..., -14.35409069, -14.82787228, -15.72653484],\n",
" ...,\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n",
"\n",
" [[-22.65289879, -21.11938667, 0.00000000 , ..., -31.80981827, -30.58669853, -28.68988228],\n",
" [-31.04699135, -21.68680763, 0.00000000 , ..., -29.90789604, -30.31726456, -30.99709320],\n",
" [-18.16406441, -17.50658417, 0.00000000 , ..., -29.47821617, -29.77137375, -30.45121002],\n",
" ...,\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n",
"\n",
" [[-16.17608452, -15.22302818, 0.00000000 , ..., -8.82944202 , -7.88900328 , -6.10806322 ],\n",
" [-19.40717316, -12.32932186, 0.00000000 , ..., -8.05214977 , -8.03145599 , -7.35137606 ],\n",
" [-11.01850796, -13.20147514, 0.00000000 , ..., -9.65334892 , -8.96987629 , -9.13897228 ],\n",
" ...,\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n",
"\n",
" [[-16.55369759, -16.95514297, 0.00000000 , ..., -7.00301647 , -6.53273058 , -10.14600754],\n",
" [-19.51947975, -14.86818218, 0.00000000 , ..., -6.82891273 , -6.22576237 , -9.42883873 ],\n",
" [-15.26447582, -22.26662445, 0.00000000 , ..., -13.31693172, -11.05612659, -12.70977211],\n",
" ...,\n",
" [-4.81728077 , -10.65084648, 0.00000000 , ..., 3.19982862 , 8.42359638 , 7.95100546 ],\n",
" [-7.54755068 , -12.56441689, 0.00000000 , ..., 4.12789631 , 6.98472023 , 7.79936218 ],\n",
" [-8.79256725 , -11.23776722, 0.00000000 , ..., 1.31829071 , 1.30352044 , 6.80789280 ]]])\n"
]
}
],
"source": [
"for idx, (audio, audio_len, text, text_len) in enumerate(batch_reader()):\n",
" print('test:', text)\n",
" print(\"test raw:\", ''.join( chr(i) for i in text[0][:int(text_len[0])] ))\n",
" print(\"test raw:\", ''.join( chr(i) for i in text[-1][:int(text_len[-1])] ))\n",
" print('audio len:', audio_len)\n",
" print('test len:', text_len)\n",
" print('audio:', audio)\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "unexpected-skating",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 9,
"id": "minus-modern",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"test: Tensor(shape=[5, 23], dtype=int32, place=CUDAPinnedPlace, stop_gradient=True,\n",
" [[87, 37, 26, 1, 87, 97, 26, 61, 87, 38, 26, 82, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],\n",
" [97, 37, 26, 79, 26, 1, 38, 82, 1, 87, 37, 3, 87, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],\n",
" [87, 26, 61, 1, 82, 26, 18, 64, 61, 25, 82, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],\n",
" [37, 26, 1, 25, 64, 26, 82, 61, 2, 87, 1, 97, 64, 79, 52, 1, 3, 87, 1, 3, 53, 53, -1],\n",
" [97, 37, 26, 79, 26, 1, 38, 82, 1, 58, 102, 1, 17, 79, 64, 87, 37, 26, 79, 1, 61, 64, 97]])\n",
"test raw: W%\u001a\u0001Wa\u001a=W&\u001aR\n",
"test raw: a%\u001aO\u001a\u0001&R\u0001:f\u0001\u0011O@W%\u001aO\u0001=@a\n",
"audio len: Tensor(shape=[5], dtype=int64, place=CUDAPinnedPlace, stop_gradient=True,\n",
" [163, 173, 184, 190, 203])\n",
"test len: Tensor(shape=[5], dtype=int64, place=CUDAPlace(0), stop_gradient=True,\n",
" [12, 13, 11, 22, 23])\n",
"audio: Tensor(shape=[5, 203, 80], dtype=float32, place=CUDAPinnedPlace, stop_gradient=True,\n",
" [[[-51.32406616, -17.91388321, 0.00000000 , ..., -26.66350746, -27.46039391, -27.22303963],\n",
" [-15.19027233, -20.52460480, 0.00000000 , ..., -28.47811317, -26.87953568, -25.13592339],\n",
" [-22.80181694, -19.48889351, 0.00000000 , ..., -29.96320724, -25.96619034, -24.57164192],\n",
" ...,\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n",
"\n",
" [[-15.38297653, -18.95307732, 0.00000000 , ..., -15.22777271, -16.46900940, -12.32327461],\n",
" [-14.06289291, -12.69954872, 0.00000000 , ..., -15.68012810, -16.92030334, -13.49134445],\n",
" [-19.78544235, -11.63046265, 0.00000000 , ..., -14.35409069, -14.82787228, -15.72653484],\n",
" ...,\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n",
"\n",
" [[-22.65289879, -21.11938667, 0.00000000 , ..., -31.80981827, -30.58669853, -28.68988228],\n",
" [-31.04699135, -21.68680763, 0.00000000 , ..., -29.90789604, -30.31726456, -30.99709320],\n",
" [-18.16406441, -17.50658417, 0.00000000 , ..., -29.47821617, -29.77137375, -30.45121002],\n",
" ...,\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n",
"\n",
" [[-16.17608452, -15.22302818, 0.00000000 , ..., -8.82944202 , -7.88900328 , -6.10806322 ],\n",
" [-19.40717316, -12.32932186, 0.00000000 , ..., -8.05214977 , -8.03145599 , -7.35137606 ],\n",
" [-11.01850796, -13.20147514, 0.00000000 , ..., -9.65334892 , -8.96987629 , -9.13897228 ],\n",
" ...,\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n",
"\n",
" [[-16.55369759, -16.95514297, 0.00000000 , ..., -7.00301647 , -6.53273058 , -10.14600754],\n",
" [-19.51947975, -14.86818218, 0.00000000 , ..., -6.82891273 , -6.22576237 , -9.42883873 ],\n",
" [-15.26447582, -22.26662445, 0.00000000 , ..., -13.31693172, -11.05612659, -12.70977211],\n",
" ...,\n",
" [-4.81728077 , -10.65084648, 0.00000000 , ..., 3.19982862 , 8.42359638 , 7.95100546 ],\n",
" [-7.54755068 , -12.56441689, 0.00000000 , ..., 4.12789631 , 6.98472023 , 7.79936218 ],\n",
" [-8.79256725 , -11.23776722, 0.00000000 , ..., 1.31829071 , 1.30352044 , 6.80789280 ]]])\n"
]
}
],
"source": [
"keep_transcription_text=False\n",
"batch_reader = create_dataloader(\n",
" manifest_path=args.infer_manifest,\n",
" unit_type=args.unit_type,\n",
" vocab_filepath=args.vocab_path,\n",
" mean_std_filepath=args.mean_std_path,\n",
" spm_model_prefix=args.spm_model_prefix,\n",
" augmentation_config='{}',\n",
" max_input_len=27.0,\n",
" min_input_len=0.0,\n",
" max_output_len=float('inf'),\n",
" min_output_len=0.0,\n",
" max_output_input_ratio=float('inf'),\n",
" min_output_input_ratio=0.0,\n",
" stride_ms=10.0,\n",
" window_ms=20.0,\n",
" max_freq=None,\n",
" specgram_type=args.specgram_type,\n",
" feat_dim=args.feat_dim,\n",
" delta_delta=args.delta_delta,\n",
" use_dB_normalization=True,\n",
" random_seed=0,\n",
" keep_transcription_text=keep_transcription_text,\n",
" is_training=False,\n",
" batch_size=args.num_samples,\n",
" num_workers=0,\n",
" sortagrad=True,\n",
" shuffle_method=None,\n",
" dist=False)\n",
"for idx, (audio, audio_len, text, text_len) in enumerate(batch_reader()):\n",
" print('test:', text)\n",
" print(\"test raw:\", ''.join( chr(i) for i in text[0][:int(text_len[0])] ))\n",
" print(\"test raw:\", ''.join( chr(i) for i in text[-1][:int(text_len[-1])] ))\n",
" print('audio len:', audio_len)\n",
" print('test len:', text_len)\n",
" print('audio:', audio)\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "chronic-diagram",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
@@ -0,0 +1,648 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "choice-lender",
"metadata": {},
"outputs": [],
"source": [
"eng=\"one minute a voice said and the time buzzer sounded\"\n",
"chn=\"可控是病毒武器最基本的要求\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "ruled-kuwait",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"o\n",
"n\n",
"e\n",
" \n",
"m\n",
"i\n",
"n\n",
"u\n",
"t\n",
"e\n",
" \n",
"a\n",
" \n",
"v\n",
"o\n",
"i\n",
"c\n",
"e\n",
" \n",
"s\n",
"a\n",
"i\n",
"d\n",
" \n",
"a\n",
"n\n",
"d\n",
" \n",
"t\n",
"h\n",
"e\n",
" \n",
"t\n",
"i\n",
"m\n",
"e\n",
" \n",
"b\n",
"u\n",
"z\n",
"z\n",
"e\n",
"r\n",
" \n",
"s\n",
"o\n",
"u\n",
"n\n",
"d\n",
"e\n",
"d\n"
]
}
],
"source": [
"for char in eng:\n",
" print(char)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "passive-petite",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"可\n",
"控\n",
"是\n",
"病\n",
"毒\n",
"武\n",
"器\n",
"最\n",
"基\n",
"本\n",
"的\n",
"要\n",
"求\n"
]
}
],
"source": [
"for char in chn:\n",
" print(char)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "olympic-realtor",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"one\n",
"minute\n",
"a\n",
"voice\n",
"said\n",
"and\n",
"the\n",
"time\n",
"buzzer\n",
"sounded\n"
]
}
],
"source": [
"for word in eng.split():\n",
" print(word)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "induced-enhancement",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"可控是病毒武器最基本的要求\n"
]
}
],
"source": [
"for word in chn.split():\n",
" print(word)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "lovely-bottle",
"metadata": {},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'StringIO'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-9-3e4825b8299f>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mStringIO\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'StringIO'"
]
}
],
"source": [
"import StringIO"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "interested-cardiff",
"metadata": {},
"outputs": [],
"source": [
"from io import StringIO"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "portable-ivory",
"metadata": {},
"outputs": [],
"source": [
"inputs = StringIO()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "compatible-destination",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"64"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"inputs.write(\"nor is mister quilter's manner less interesting than his matter\" + '\\n')"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "federal-margin",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"nor is mister quilter's manner less interesting than his matternor is mister quilter's manner less interesting than his matter\n",
"\n"
]
}
],
"source": [
"print(inputs.getvalue())"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "consecutive-entity",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"64"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"inputs.write(\"nor is mister quilter's manner less interesting than his matter\" + '\\n')"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "desirable-anxiety",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"nor is mister quilter's manner less interesting than his matternor is mister quilter's manner less interesting than his matter\n",
"nor is mister quilter's manner less interesting than his matter\n",
"\n"
]
}
],
"source": [
"print(inputs.getvalue())"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "employed-schedule",
"metadata": {},
"outputs": [],
"source": [
"import tempfile"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "unlikely-honduras",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['__class__', '__del__', '__delattr__', '__dict__', '__dir__', '__doc__', '__enter__', '__eq__', '__exit__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__lt__', '__ne__', '__new__', '__next__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '_checkClosed', '_checkReadable', '_checkSeekable', '_checkWritable', '_dealloc_warn', '_finalizing', 'close', 'closed', 'detach', 'fileno', 'flush', 'isatty', 'mode', 'name', 'peek', 'raw', 'read', 'read1', 'readable', 'readinto', 'readinto1', 'readline', 'readlines', 'seek', 'seekable', 'tell', 'truncate', 'writable', 'write', 'writelines']\n",
"57\n"
]
}
],
"source": [
"with tempfile.TemporaryFile() as fp:\n",
" print(dir(fp))\n",
" print(fp.name)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "needed-trail",
"metadata": {},
"outputs": [],
"source": [
"a = tempfile.mkstemp(suffix=None, prefix='test', dir=None, text=False)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "hazardous-choir",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['__add__', '__class__', '__contains__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getnewargs__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__mul__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__rmul__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', 'count', 'index']\n"
]
}
],
"source": [
"print(dir(a))"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "front-sauce",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(57, '/tmp/test27smzbzc')\n"
]
}
],
"source": [
"print(a)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "shared-wages",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<built-in method index of tuple object at 0x7f999b525648>\n"
]
}
],
"source": [
"print(a.index)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "charged-carnival",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__enter__', '__eq__', '__exit__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_closer', 'close', 'delete', 'file', 'name']\n",
"/tmp/tmpfjn7mygy\n"
]
}
],
"source": [
"fp= tempfile.NamedTemporaryFile(mode='w', delete=False)\n",
"print(dir(fp))\n",
"print(fp.name)\n",
"fp.close()"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "religious-terror",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/tmp/tmpfjn7mygy\n"
]
}
],
"source": [
"import os\n",
"os.path.exists(fp.name)\n",
"print(fp.name)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "communist-gospel",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<function BufferedRandom.write(buffer, /)>"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fp.write"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "simplified-clarity",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'example'"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"s='/home/ubuntu/python/example.py'\n",
"os.path.splitext(os.path.basename(s))[0]"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "popular-genius",
"metadata": {},
"outputs": [],
"source": [
"from collections import Counter"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "studied-burner",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"dict_items([('hello', 1), ('world', 1)])\n"
]
}
],
"source": [
"counter = Counter()\n",
"counter.update([\"hello\"])\n",
"counter.update([\"world\"])\n",
"print(counter.items())"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "mineral-ceremony",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"dict_items([('h', 1), ('e', 1), ('l', 3), ('o', 2), ('w', 1), ('r', 1), ('d', 1)])\n"
]
}
],
"source": [
"counter = Counter()\n",
"counter.update(\"hello\")\n",
"counter.update(\"world\")\n",
"print(counter.items())"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "nonprofit-freedom",
"metadata": {},
"outputs": [],
"source": [
"counter.update(list(\"hello\"))"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "extended-methodology",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"dict_items([('h', 2), ('e', 2), ('l', 5), ('o', 3), ('w', 1), ('r', 1), ('d', 1)])\n"
]
}
],
"source": [
"print(counter.items())"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "grand-benjamin",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['h', 'e', 'l', 'l', 'o']"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(\"hello\")"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "marine-fundamentals",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{}\n"
]
}
],
"source": [
"from io import StringIO\n",
"a = StringIO(initial_value='{}', newline='')\n",
"print(a.read())"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "suitable-charlotte",
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "expected str, bytes or os.PathLike object, not _io.StringIO",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-56-4323a912120d>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m     \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mTypeError\u001b[0m: expected str, bytes or os.PathLike object, not _io.StringIO"
]
}
],
"source": [
"with io.open(a) as f:\n",
" print(f.read())"
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "institutional-configuration",
"metadata": {},
"outputs": [],
"source": [
"io.open?"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "pregnant-modem",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
@@ -1,146 +0,0 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
#           2014 David Snyder

# This script combines the data from multiple source directories into
# a single destination directory.

# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data for information
# about what these directories contain.

# Begin configuration section.
extra_files= # specify additional files in 'src-data-dir' to merge, ex. "file1 file2 ..."
skip_fix=false # skip the fix_data_dir.sh in the end
# End configuration section.

echo "$0 $@" # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
if [ -f parse_options.sh ]; then . parse_options.sh || exit 1; fi

if [ $# -lt 2 ]; then
  echo "Usage: combine_data.sh [--extra-files 'file1 file2'] <dest-data-dir> <src-data-dir1> <src-data-dir2> ..."
  echo "Note, files that don't appear in all source dirs will not be combined,"
  echo "with the exception of utt2uniq and segments, which are created where necessary."
  exit 1
fi

dest=$1;
shift;

first_src=$1;

rm -r $dest 2>/dev/null
mkdir -p $dest;

export LC_ALL=C

for dir in $*; do
  if [ ! -f $dir/utt2spk ]; then
    echo "$0: no such file $dir/utt2spk"
    exit 1;
  fi
done

# Check that frame_shift are compatible, where present together with features.
dir_with_frame_shift=
for dir in $*; do
  if [[ -f $dir/feats.scp && -f $dir/frame_shift ]]; then
    if [[ $dir_with_frame_shift ]] &&
        ! cmp -s $dir_with_frame_shift/frame_shift $dir/frame_shift; then
      echo "$0:error: different frame_shift in directories $dir and " \
           "$dir_with_frame_shift. Cannot combine features."
      exit 1;
    fi
    dir_with_frame_shift=$dir
  fi
done

# W.r.t. utt2uniq file the script has different behavior compared to other files
# it is not compulsary for it to exist in src directories, but if it exists in
# even one it should exist in all. We will create the files where necessary
has_utt2uniq=false
for in_dir in $*; do
  if [ -f $in_dir/utt2uniq ]; then
    has_utt2uniq=true
    break
  fi
done

if $has_utt2uniq; then
  # we are going to create an utt2uniq file in the destdir
  for in_dir in $*; do
    if [ ! -f $in_dir/utt2uniq ]; then
      # we assume that utt2uniq is a one to one mapping
      cat $in_dir/utt2spk | awk '{printf("%s %s\n", $1, $1);}'
    else
      cat $in_dir/utt2uniq
    fi
  done | sort -k1 > $dest/utt2uniq
  echo "$0: combined utt2uniq"
else
  echo "$0 [info]: not combining utt2uniq as it does not exist"
fi
# some of the old scripts might provide utt2uniq as an extrafile, so just remove it
extra_files=$(echo "$extra_files"|sed -e "s/utt2uniq//g")

# segments are treated similarly to utt2uniq. If it exists in some, but not all
# src directories, then we generate segments where necessary.
has_segments=false
for in_dir in $*; do
  if [ -f $in_dir/segments ]; then
    has_segments=true
    break
  fi
done

if $has_segments; then
  for in_dir in $*; do
    if [ ! -f $in_dir/segments ]; then
      echo "$0 [info]: will generate missing segments for $in_dir" 1>&2
      utils/data/get_segments_for_data.sh $in_dir
    else
      cat $in_dir/segments
    fi
  done | sort -k1 > $dest/segments
  echo "$0: combined segments"
else
  echo "$0 [info]: not combining segments as it does not exist"
fi

for file in utt2spk utt2lang utt2dur utt2num_frames reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do
  exists_somewhere=false
  absent_somewhere=false
  for d in $*; do
    if [ -f $d/$file ]; then
      exists_somewhere=true
    else
      absent_somewhere=true
    fi
  done

  if ! $absent_somewhere; then
    set -o pipefail
    ( for f in $*; do cat $f/$file; done ) | sort -k1 > $dest/$file || exit 1;
    set +o pipefail
    echo "$0: combined $file"
  else
    if ! $exists_somewhere; then
      echo "$0 [info]: not combining $file as it does not exist"
    else
      echo "$0 [info]: **not combining $file as it does not exist everywhere**"
    fi
  fi
done

tools/utt2spk_to_spk2utt.pl <$dest/utt2spk >$dest/spk2utt

if [[ $dir_with_frame_shift ]]; then
  cp $dir_with_frame_shift/frame_shift $dest
fi

if ! $skip_fix ; then
  tools/fix_data_dir.sh $dest || exit 1;
fi

exit 0
@@ -1,104 +0,0 @@
#!/usr/bin/env perl
# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


$ignore_oov = 0;

for($x = 0; $x < 2; $x++) {
  if ($ARGV[0] eq "--map-oov") {
    shift @ARGV;
    $map_oov = shift @ARGV;
    if ($map_oov eq "-f" || $map_oov =~ m/words\.txt$/ || $map_oov eq "") {
      # disallow '-f', the empty string and anything ending in words.txt as the
      # OOV symbol because these are likely command-line errors.
      die "the --map-oov option requires an argument";
    }
  }
  if ($ARGV[0] eq "-f") {
    shift @ARGV;
    $field_spec = shift @ARGV;
    if ($field_spec =~ m/^\d+$/) {
      $field_begin = $field_spec - 1; $field_end = $field_spec - 1;
    }
    if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesty (properly, 1-10)
      if ($1 ne "") {
        $field_begin = $1 - 1; # Change to zero-based indexing.
      }
      if ($2 ne "") {
        $field_end = $2 - 1; # Change to zero-based indexing.
      }
    }
    if (!defined $field_begin && !defined $field_end) {
      die "Bad argument to -f option: $field_spec";
    }
  }
}

$symtab = shift @ARGV;
if (!defined $symtab) {
  print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" .
    "options: [--map-oov <oov-symbol> ] [-f <field-range> ]\n" .
    "note: <field-range> can look like 4-5, or 4-, or 5-, or 1.\n";
}
open(F, "<$symtab") || die "Error opening symbol table file $symtab";
while(<F>) {
  @A = split(" ", $_);
  @A == 2 || die "bad line in symbol table file: $_";
  $sym2int{$A[0]} = $A[1] + 0;
}

if (defined $map_oov && $map_oov !~ m/^\d+$/) { # not numeric-> look it up
  if (!defined $sym2int{$map_oov}) { die "OOV symbol $map_oov not defined."; }
  $map_oov = $sym2int{$map_oov};
}

$num_warning = 0;
$max_warning = 20;

while (<>) {
  @A = split(" ", $_);
  @B = ();
  for ($n = 0; $n < @A; $n++) {
    $a = $A[$n];
    if ( (!defined $field_begin || $n >= $field_begin)
         && (!defined $field_end || $n <= $field_end)) {
      $i = $sym2int{$a};
      if (!defined ($i)) {
        if (defined $map_oov) {
          if ($num_warning++ < $max_warning) {
            print STDERR "sym2int.pl: replacing $a with $map_oov\n";
            if ($num_warning == $max_warning) {
              print STDERR "sym2int.pl: not warning for OOVs any more times\n";
            }
          }
          $i = $map_oov;
        } else {
          $pos = $n+1;
          die "sym2int.pl: undefined symbol $a (in position $pos)\n";
        }
      }
      $a = $i;
    }
    push @B, $a;
  }
  print join(" ", @B);
  print "\n";
}
if ($num_warning > 0) {
  print STDERR "** Replaced $num_warning instances of OOVs with $map_oov\n";
}

exit(0);