diff --git a/audio_data_utils.py b/audio_data_utils.py
deleted file mode 100644
index 1cd29be11..000000000
--- a/audio_data_utils.py
+++ /dev/null
@@ -1,411 +0,0 @@
-"""
-    Providing basic audio data preprocessing pipeline, and offering
-    both instance-level and batch-level data reader interfaces.
-"""
-import paddle.v2 as paddle
-import logging
-import json
-import random
-import soundfile
-import numpy as np
-import itertools
-import os
-
-RANDOM_SEED = 0
-logger = logging.getLogger(__name__)
-
-
-class DataGenerator(object):
-    """
-    DataGenerator provides basic audio data preprocessing pipeline, and offers
-    both instance-level and batch-level data reader interfaces.
-    Normalized FFT are used as audio features here.
-
-    :param vocab_filepath: Vocabulary file path for indexing tokenized
-                           transcriptions.
-    :type vocab_filepath: basestring
-    :param normalizer_manifest_path: Manifest filepath for collecting feature
-                                     normalization statistics, e.g. mean, std.
-    :type normalizer_manifest_path: basestring
-    :param normalizer_num_samples: Number of instances sampled for collecting
-                                   feature normalization statistics.
-                                   Default is 100.
-    :type normalizer_num_samples: int
-    :param max_duration: Audio clips with duration (in seconds) greater than
-                         this will be discarded. Default is 20.0.
-    :type max_duration: float
-    :param min_duration: Audio clips with duration (in seconds) smaller than
-                         this will be discarded. Default is 0.0.
-    :type min_duration: float
-    :param stride_ms: Striding size (in milliseconds) for generating frames.
-                      Default is 10.0. 
-    :type stride_ms: float
-    :param window_ms: Window size (in milliseconds) for frames. Default is 20.0.
-    :type window_ms: float
-    :param max_frequency: Maximun frequency for FFT features. FFT features of
-                          frequency larger than this will be discarded.
-                          If set None, all features will be kept.
-                          Default is None.
-    :type max_frequency: float
-    """
-
-    def __init__(self,
-                 vocab_filepath,
-                 normalizer_manifest_path,
-                 normalizer_num_samples=100,
-                 max_duration=20.0,
-                 min_duration=0.0,
-                 stride_ms=10.0,
-                 window_ms=20.0,
-                 max_frequency=None):
-        self.__max_duration__ = max_duration
-        self.__min_duration__ = min_duration
-        self.__stride_ms__ = stride_ms
-        self.__window_ms__ = window_ms
-        self.__max_frequency__ = max_frequency
-        self.__epoc__ = 0
-        self.__random__ = random.Random(RANDOM_SEED)
-        # load vocabulary (dictionary)
-        self.__vocab_dict__, self.__vocab_list__ = \
-            self.__load_vocabulary_from_file__(vocab_filepath)
-        # collect normalizer statistics
-        self.__mean__, self.__std__ = self.__collect_normalizer_statistics__(
-            manifest_path=normalizer_manifest_path,
-            num_samples=normalizer_num_samples)
-
-    def __audio_featurize__(self, audio_filename):
-        """
-        Preprocess audio data, including feature extraction, normalization etc..
-        """
-        features = self.__audio_basic_featurize__(audio_filename)
-        return self.__normalize__(features)
-
-    def __text_featurize__(self, text):
-        """
-        Preprocess text data, including tokenizing and token indexing etc..
-        """
-        return self.__convert_text_to_char_index__(
-            text=text, vocabulary=self.__vocab_dict__)
-
-    def __audio_basic_featurize__(self, audio_filename):
-        """
-        Compute basic (without normalization etc.) features for audio data.
-        """
-        return self.__spectrogram_from_file__(
-            filename=audio_filename,
-            stride_ms=self.__stride_ms__,
-            window_ms=self.__window_ms__,
-            max_freq=self.__max_frequency__)
-
-    def __collect_normalizer_statistics__(self, manifest_path, num_samples=100):
-        """
-        Compute feature normalization statistics, i.e. mean and stddev.
-        """
-        # read manifest
-        manifest = self.__read_manifest__(
-            manifest_path=manifest_path,
-            max_duration=self.__max_duration__,
-            min_duration=self.__min_duration__)
-        # sample for statistics
-        sampled_manifest = self.__random__.sample(manifest, num_samples)
-        # extract spectrogram feature
-        features = []
-        for instance in sampled_manifest:
-            spectrogram = self.__audio_basic_featurize__(
-                instance["audio_filepath"])
-            features.append(spectrogram)
-        features = np.hstack(features)
-        mean = np.mean(features, axis=1).reshape([-1, 1])
-        std = np.std(features, axis=1).reshape([-1, 1])
-        return mean, std
-
-    def __normalize__(self, features, eps=1e-14):
-        """
-        Normalize features to be of zero mean and unit stddev.
-        """
-        return (features - self.__mean__) / (self.__std__ + eps)
-
-    def __spectrogram_from_file__(self,
-                                  filename,
-                                  stride_ms=10.0,
-                                  window_ms=20.0,
-                                  max_freq=None,
-                                  eps=1e-14):
-        """
-        Laod audio data and calculate the log of spectrogram by FFT.
-        Refer to utils.py in https://github.com/baidu-research/ba-dls-deepspeech
-        """
-        audio, sample_rate = soundfile.read(filename)
-        if audio.ndim >= 2:
-            audio = np.mean(audio, 1)
-        if max_freq is None:
-            max_freq = sample_rate / 2
-        if max_freq > sample_rate / 2:
-            raise ValueError("max_freq must be greater than half of "
-                             "sample rate.")
-        if stride_ms > window_ms:
-            raise ValueError("Stride size must not be greater than "
-                             "window size.")
-        stride_size = int(0.001 * sample_rate * stride_ms)
-        window_size = int(0.001 * sample_rate * window_ms)
-        spectrogram, freqs = self.__extract_spectrogram__(
-            audio,
-            window_size=window_size,
-            stride_size=stride_size,
-            sample_rate=sample_rate)
-        ind = np.where(freqs <= max_freq)[0][-1] + 1
-        return np.log(spectrogram[:ind, :] + eps)
-
-    def __extract_spectrogram__(self, samples, window_size, stride_size,
-                                sample_rate):
-        """
-        Compute the spectrogram by FFT for a discrete real signal.
-        Refer to utils.py in https://github.com/baidu-research/ba-dls-deepspeech
-        """
-        # extract strided windows
-        truncate_size = (len(samples) - window_size) % stride_size
-        samples = samples[:len(samples) - truncate_size]
-        nshape = (window_size, (len(samples) - window_size) // stride_size + 1)
-        nstrides = (samples.strides[0], samples.strides[0] * stride_size)
-        windows = np.lib.stride_tricks.as_strided(
-            samples, shape=nshape, strides=nstrides)
-        assert np.all(
-            windows[:, 1] == samples[stride_size:(stride_size + window_size)])
-        # window weighting, squared Fast Fourier Transform (fft), scaling
-        weighting = np.hanning(window_size)[:, None]
-        fft = np.fft.rfft(windows * weighting, axis=0)
-        fft = np.absolute(fft)**2
-        scale = np.sum(weighting**2) * sample_rate
-        fft[1:-1, :] *= (2.0 / scale)
-        fft[(0, -1), :] /= scale
-        # prepare fft frequency list
-        freqs = float(sample_rate) / window_size * np.arange(fft.shape[0])
-        return fft, freqs
-
-    def __load_vocabulary_from_file__(self, vocabulary_path):
-        """
-        Load vocabulary from file.
-        """
-        if not os.path.exists(vocabulary_path):
-            raise ValueError("Vocabulary file %s not found.", vocabulary_path)
-        vocab_lines = []
-        with open(vocabulary_path, 'r') as file:
-            vocab_lines.extend(file.readlines())
-        vocab_list = [line[:-1] for line in vocab_lines]
-        vocab_dict = dict(
-            [(token, id) for (id, token) in enumerate(vocab_list)])
-        return vocab_dict, vocab_list
-
-    def __convert_text_to_char_index__(self, text, vocabulary):
-        """
-        Convert text string to a list of character index integers.
-        """
-        return [vocabulary[w] for w in text]
-
-    def __read_manifest__(self, manifest_path, max_duration, min_duration):
-        """
-        Load and parse manifest file.
-        """
-        manifest = []
-        for json_line in open(manifest_path):
-            try:
-                json_data = json.loads(json_line)
-            except Exception as e:
-                raise ValueError("Error reading manifest: %s" % str(e))
-            if (json_data["duration"] <= max_duration and
-                    json_data["duration"] >= min_duration):
-                manifest.append(json_data)
-        return manifest
-
-    def __padding_batch__(self, batch, padding_to=-1, flatten=False):
-        """
-        Padding audio part of features (only in the time axis -- column axis)
-        with zeros, to make each instance in the batch share the same
-        audio feature shape.
-
-        If `padding_to` is set -1, the maximun column numbers in the batch will
-        be used as the target size. Otherwise, `padding_to` will be the target
-        size. Default is -1.
-
-        If `flatten` is set True, audio data will be flatten to be a 1-dim
-        ndarray. Default is False.
-        """
-        new_batch = []
-        # get target shape
-        max_length = max([audio.shape[1] for audio, text in batch])
-        if padding_to != -1:
-            if padding_to < max_length:
-                raise ValueError("If padding_to is not -1, it should be greater"
-                                 " or equal to the original instance length.")
-            max_length = padding_to
-        # padding
-        for audio, text in batch:
-            padded_audio = np.zeros([audio.shape[0], max_length])
-            padded_audio[:, :audio.shape[1]] = audio
-            if flatten:
-                padded_audio = padded_audio.flatten()
-            new_batch.append((padded_audio, text))
-        return new_batch
-
-    def __batch_shuffle__(self, manifest, batch_size):
-        """
-        The instances have different lengths and they cannot be
-        combined into a single matrix multiplication. It usually
-        sorts the training examples by length and combines only
-        similarly-sized instances into minibatches, pads with
-        silence when necessary so that all instances in a batch
-        have the same length. This batch shuffle fuction is used
-        to make similarly-sized instances into minibatches and
-        make a batch-wise shuffle.
-
-        1. Sort the audio clips by duration.
-        2. Generate a random number `k`, k in [0, batch_size).
-        3. Randomly remove `k` instances in order to make different mini-batches,
-           then make minibatches and each minibatch size is batch_size.
-        4. Shuffle the minibatches.
-
-        :param manifest: manifest file.
-        :type manifest: list
-        :param batch_size: Batch size. This size is also used for generate
-                           a random number for batch shuffle.
-        :type batch_size: int
-        :return: batch shuffled mainifest.
-        :rtype: list
-        """
-        manifest.sort(key=lambda x: x["duration"])
-        shift_len = self.__random__.randint(0, batch_size - 1)
-        batch_manifest = zip(*[iter(manifest[shift_len:])] * batch_size)
-        self.__random__.shuffle(batch_manifest)
-        batch_manifest = list(sum(batch_manifest, ()))
-        res_len = len(manifest) - shift_len - len(batch_manifest)
-        batch_manifest.extend(manifest[-res_len:])
-        batch_manifest.extend(manifest[0:shift_len])
-        return batch_manifest
-
-    def instance_reader_creator(self, manifest):
-        """
-        Instance reader creator for audio data. Creat a callable function to
-        produce instances of data.
-
-        Instance: a tuple of a numpy ndarray of audio spectrogram and a list of
-        tokenized and indexed transcription text.
-
-        :param manifest: Filepath of manifest for audio clip files.
-        :type manifest: basestring
-        :return: Data reader function.
-        :rtype: callable
-        """
-
-        def reader():
-            # extract spectrogram feature
-            for instance in manifest:
-                spectrogram = self.__audio_featurize__(
-                    instance["audio_filepath"])
-                transcript = self.__text_featurize__(instance["text"])
-                yield (spectrogram, transcript)
-
-        return reader
-
-    def batch_reader_creator(self,
-                             manifest_path,
-                             batch_size,
-                             padding_to=-1,
-                             flatten=False,
-                             sortagrad=False,
-                             batch_shuffle=False):
-        """
-        Batch data reader creator for audio data. Creat a callable function to
-        produce batches of data.
-        
-        Audio features will be padded with zeros to make each instance in the
-        batch to share the same audio feature shape.
-
-        :param manifest_path: Filepath of manifest for audio clip files.
-        :type manifest_path: basestring
-        :param batch_size: Instance number in a batch.
-        :type batch_size: int
-        :param padding_to:  If set -1, the maximun column numbers in the batch
-                            will be used as the target size for padding.
-                            Otherwise, `padding_to` will be the target size.
-                            Default is -1.
-        :type padding_to: int
-        :param flatten: If set True, audio data will be flatten to be a 1-dim
-                        ndarray. Otherwise, 2-dim ndarray. Default is False.
-        :type flatten: bool
-        :param sortagrad: Sort the audio clips by duration in the first epoc
-                          if set True.
-        :type sortagrad: bool
-        :param batch_shuffle: Shuffle the audio clips if set True. It is
-                              not a thorough instance-wise shuffle, but a
-                              specific batch-wise shuffle. For more details,
-                              please see `__batch_shuffle__` function.
-        :type batch_shuffle: bool
-        :return: Batch reader function, producing batches of data when called.
-        :rtype: callable
-        """
-
-        def batch_reader():
-            # read manifest
-            manifest = self.__read_manifest__(
-                manifest_path=manifest_path,
-                max_duration=self.__max_duration__,
-                min_duration=self.__min_duration__)
-
-            # sort (by duration) or shuffle manifest
-            if self.__epoc__ == 0 and sortagrad:
-                manifest.sort(key=lambda x: x["duration"])
-            elif batch_shuffle:
-                manifest = self.__batch_shuffle__(manifest, batch_size)
-
-            instance_reader = self.instance_reader_creator(manifest)
-            batch = []
-            for instance in instance_reader():
-                batch.append(instance)
-                if len(batch) == batch_size:
-                    yield self.__padding_batch__(batch, padding_to, flatten)
-                    batch = []
-            if len(batch) > 0:
-                yield self.__padding_batch__(batch, padding_to, flatten)
-            self.__epoc__ += 1
-
-        return batch_reader
-
-    def vocabulary_size(self):
-        """
-        Get vocabulary size.
-
-        :return: Vocabulary size.
-        :rtype: int
-        """
-        return len(self.__vocab_list__)
-
-    def vocabulary_dict(self):
-        """
-        Get vocabulary in dict.
-
-        :return: Vocabulary in dict.
-        :rtype: dict
-        """
-        return self.__vocab_dict__
-
-    def vocabulary_list(self):
-        """
-        Get vocabulary in list.
-
-        :return: Vocabulary in list
-        :rtype: list
-        """
-        return self.__vocab_list__
-
-    def data_name_feeding(self):
-        """
-        Get feeddings (data field name and corresponding field id).
-
-        :return: Feeding dict.
-        :rtype: dict
-        """
-        feeding = {
-            "audio_spectrogram": 0,
-            "transcript_text": 1,
-        }
-        return feeding
diff --git a/compute_mean_std.py b/compute_mean_std.py
new file mode 100755
index 000000000..b3015df73
--- /dev/null
+++ b/compute_mean_std.py
@@ -0,0 +1,56 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+from data_utils.normalizer import FeatureNormalizer
+from data_utils.augmentor.augmentation import AugmentationPipeline
+from data_utils.featurizer.audio_featurizer import AudioFeaturizer
+
+parser = argparse.ArgumentParser(
+    description='Computing mean and stddev for feature normalizer.')
+parser.add_argument(
+    "--manifest_path",
+    default='datasets/manifest.train',
+    type=str,
+    help="Manifest path for computing normalizer's mean and stddev."
+    "(default: %(default)s)")
+parser.add_argument(
+    "--num_samples",
+    default=500,
+    type=int,
+    help="Number of samples for computing mean and stddev. "
+    "(default: %(default)s)")
+parser.add_argument(
+    "--augmentation_config",
+    default='{}',
+    type=str,
+    help="Augmentation configuration in json-format. "
+    "(default: %(default)s)")
+parser.add_argument(
+    "--output_file",
+    default='mean_std.npz',
+    type=str,
+    help="Filepath to write mean and std to (.npz)."
+    "(default: %(default)s)")
+args = parser.parse_args()
+
+
+def main():
+    augmentation_pipeline = AugmentationPipeline(args.augmentation_config)
+    audio_featurizer = AudioFeaturizer()
+
+    def augment_and_featurize(audio_segment):
+        augmentation_pipeline.transform_audio(audio_segment)
+        return audio_featurizer.featurize(audio_segment)
+
+    normalizer = FeatureNormalizer(
+        mean_std_filepath=None,
+        manifest_path=args.manifest_path,
+        featurize_func=augment_and_featurize,
+        num_samples=args.num_samples)
+    normalizer.write_to_file(args.output_file)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/data_utils/__init__.py b/data_utils/__init__.py
new file mode 100755
index 000000000..e69de29bb
diff --git a/data_utils/audio.py b/data_utils/audio.py
new file mode 100755
index 000000000..46b241201
--- /dev/null
+++ b/data_utils/audio.py
@@ -0,0 +1,68 @@
+import numpy as np
+import io
+import soundfile
+
+
+class AudioSegment(object):
+    """Monaural audio segment abstraction.
+    """
+
+    def __init__(self, samples, sample_rate):
+        if not samples.dtype == np.float32:
+            raise ValueError("Sample data type of [%s] is not supported.")
+        self._samples = samples
+        self._sample_rate = sample_rate
+        if self._samples.ndim >= 2:
+            self._samples = np.mean(self._samples, 1)
+
+    @classmethod
+    def from_file(cls, filepath):
+        samples, sample_rate = soundfile.read(filepath, dtype='float32')
+        return cls(samples, sample_rate)
+
+    @classmethod
+    def from_bytes(cls, bytes):
+        samples, sample_rate = soundfile.read(
+            io.BytesIO(bytes), dtype='float32')
+        return cls(samples, sample_rate)
+
+    def apply_gain(self, gain):
+        self.samples *= 10.**(gain / 20.)
+
+    def resample(self, target_sample_rate):
+        raise NotImplementedError()
+
+    def change_speed(self, rate):
+        raise NotImplementedError()
+
+    @property
+    def samples(self):
+        return self._samples.copy()
+
+    @property
+    def sample_rate(self):
+        return self._sample_rate
+
+    @property
+    def duration(self):
+        return self._samples.shape[0] / float(self._sample_rate)
+
+
+class SpeechSegment(AudioSegment):
+    def __init__(self, samples, sample_rate, transcript):
+        AudioSegment.__init__(self, samples, sample_rate)
+        self._transcript = transcript
+
+    @classmethod
+    def from_file(cls, filepath, transcript):
+        audio = AudioSegment.from_file(filepath)
+        return cls(audio.samples, audio.sample_rate, transcript)
+
+    @classmethod
+    def from_bytes(cls, bytes, transcript):
+        audio = AudioSegment.from_bytes(bytes)
+        return cls(audio.samples, audio.sample_rate, transcript)
+
+    @property
+    def transcript(self):
+        return self._transcript
diff --git a/data_utils/augmentor/__init__.py b/data_utils/augmentor/__init__.py
new file mode 100755
index 000000000..e69de29bb
diff --git a/data_utils/augmentor/augmentation.py b/data_utils/augmentor/augmentation.py
new file mode 100755
index 000000000..3a1426a1f
--- /dev/null
+++ b/data_utils/augmentor/augmentation.py
@@ -0,0 +1,38 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+import random
+from data_utils.augmentor.volumn_perturb import VolumnPerturbAugmentor
+
+
+class AugmentationPipeline(object):
+    def __init__(self, augmentation_config, random_seed=0):
+        self._rng = random.Random(random_seed)
+        self._augmentors, self._rates = self._parse_pipeline_from(
+            augmentation_config)
+
+    def transform_audio(self, audio_segment):
+        for augmentor, rate in zip(self._augmentors, self._rates):
+            if self._rng.uniform(0., 1.) <= rate:
+                augmentor.transform_audio(audio_segment)
+
+    def _parse_pipeline_from(self, config_json):
+        try:
+            configs = json.loads(config_json)
+        except Exception as e:
+            raise ValueError("Augmentation config json format error: "
+                             "%s" % str(e))
+        augmentors = [
+            self._get_augmentor(config["type"], config["params"])
+            for config in configs
+        ]
+        rates = [config["rate"] for config in configs]
+        return augmentors, rates
+
+    def _get_augmentor(self, augmentor_type, params):
+        if augmentor_type == "volumn":
+            return VolumnPerturbAugmentor(self._rng, **params)
+        else:
+            raise ValueError("Unknown augmentor type [%s]." % augmentor_type)
diff --git a/data_utils/augmentor/base.py b/data_utils/augmentor/base.py
new file mode 100755
index 000000000..e801b9b18
--- /dev/null
+++ b/data_utils/augmentor/base.py
@@ -0,0 +1,17 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from abc import ABCMeta, abstractmethod
+
+
+class AugmentorBase(object):
+    __metaclass__ = ABCMeta
+
+    @abstractmethod
+    def __init__(self):
+        pass
+
+    @abstractmethod
+    def transform_audio(self, audio_segment):
+        pass
diff --git a/data_utils/augmentor/volumn_perturb.py b/data_utils/augmentor/volumn_perturb.py
new file mode 100755
index 000000000..dd1ba53a7
--- /dev/null
+++ b/data_utils/augmentor/volumn_perturb.py
@@ -0,0 +1,17 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import random
+from data_utils.augmentor.base import AugmentorBase
+
+
+class VolumnPerturbAugmentor(AugmentorBase):
+    def __init__(self, rng, min_gain_dBFS, max_gain_dBFS):
+        self._min_gain_dBFS = min_gain_dBFS
+        self._max_gain_dBFS = max_gain_dBFS
+        self._rng = rng
+
+    def transform_audio(self, audio_segment):
+        gain = self._rng.uniform(min_gain_dBFS, max_gain_dBFS)
+        audio_segment.apply_gain(gain)
diff --git a/data_utils/data.py b/data_utils/data.py
new file mode 100644
index 000000000..630007932
--- /dev/null
+++ b/data_utils/data.py
@@ -0,0 +1,247 @@
+"""
+    Providing basic audio data preprocessing pipeline, and offering
+    both instance-level and batch-level data reader interfaces.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import random
+import numpy as np
+import paddle.v2 as paddle
+from data_utils import utils
+from data_utils.augmentor.augmentation import AugmentationPipeline
+from data_utils.featurizer.speech_featurizer import SpeechFeaturizer
+from data_utils.audio import SpeechSegment
+from data_utils.normalizer import FeatureNormalizer
+
+
+class DataGenerator(object):
+    """
+    DataGenerator provides basic audio data preprocessing pipeline, and offers
+    both instance-level and batch-level data reader interfaces.
+    Normalized FFT are used as audio features here.
+
+    :param vocab_filepath: Vocabulary file path for indexing tokenized
+                           transcriptions.
+    :type vocab_filepath: basestring
+    :param normalizer_manifest_path: Manifest filepath for collecting feature
+                                     normalization statistics, e.g. mean, std.
+    :type normalizer_manifest_path: basestring
+    :param normalizer_num_samples: Number of instances sampled for collecting
+                                   feature normalization statistics.
+                                   Default is 100.
+    :type normalizer_num_samples: int
+    :param max_duration: Audio clips with duration (in seconds) greater than
+                         this will be discarded. Default is 20.0.
+    :type max_duration: float
+    :param min_duration: Audio clips with duration (in seconds) smaller than
+                         this will be discarded. Default is 0.0.
+    :type min_duration: float
+    :param stride_ms: Striding size (in milliseconds) for generating frames.
+                      Default is 10.0. 
+    :type stride_ms: float
+    :param window_ms: Window size (in milliseconds) for frames. Default is 20.0.
+    :type window_ms: float
+    :param max_frequency: Maximun frequency for FFT features. FFT features of
+                          frequency larger than this will be discarded.
+                          If set None, all features will be kept.
+                          Default is None.
+    :type max_frequency: float
+    """
+
+    def __init__(self,
+                 vocab_filepath,
+                 mean_std_filepath,
+                 augmentation_config='{}',
+                 max_duration=float('inf'),
+                 min_duration=0.0,
+                 stride_ms=10.0,
+                 window_ms=20.0,
+                 max_freq=None,
+                 random_seed=0):
+        self._max_duration = max_duration
+        self._min_duration = min_duration
+        self._normalizer = FeatureNormalizer(mean_std_filepath)
+        self._augmentation_pipeline = AugmentationPipeline(
+            augmentation_config=augmentation_config, random_seed=random_seed)
+        self._speech_featurizer = SpeechFeaturizer(
+            vocab_filepath=vocab_filepath,
+            stride_ms=stride_ms,
+            window_ms=window_ms,
+            max_freq=max_freq,
+            random_seed=random_seed)
+        self._rng = random.Random(random_seed)
+        self._epoch = 0
+
+    def batch_reader_creator(self,
+                             manifest_path,
+                             batch_size,
+                             padding_to=-1,
+                             flatten=False,
+                             sortagrad=False,
+                             batch_shuffle=False):
+        """
+        Batch data reader creator for audio data. Creat a callable function to
+        produce batches of data.
+        
+        Audio features will be padded with zeros to make each instance in the
+        batch to share the same audio feature shape.
+
+        :param manifest_path: Filepath of manifest for audio clip files.
+        :type manifest_path: basestring
+        :param batch_size: Instance number in a batch.
+        :type batch_size: int
+        :param padding_to:  If set -1, the maximun column numbers in the batch
+                            will be used as the target size for padding.
+                            Otherwise, `padding_to` will be the target size.
+                            Default is -1.
+        :type padding_to: int
+        :param flatten: If set True, audio data will be flatten to be a 1-dim
+                        ndarray. Otherwise, 2-dim ndarray. Default is False.
+        :type flatten: bool
+        :param sortagrad: Sort the audio clips by duration in the first epoc
+                          if set True.
+        :type sortagrad: bool
+        :param batch_shuffle: Shuffle the audio clips if set True. It is
+                              not a thorough instance-wise shuffle, but a
+                              specific batch-wise shuffle. For more details,
+                              please see `_batch_shuffle` function.
+        :type batch_shuffle: bool
+        :return: Batch reader function, producing batches of data when called.
+        :rtype: callable
+        """
+
+        def batch_reader():
+            # read manifest
+            manifest = utils.read_manifest(
+                manifest_path=manifest_path,
+                max_duration=self._max_duration,
+                min_duration=self._min_duration)
+            # sort (by duration) or batch-wise shuffle the manifest
+            if self._epoch == 0 and sortagrad:
+                manifest.sort(key=lambda x: x["duration"])
+            elif batch_shuffle:
+                manifest = self._batch_shuffle(manifest, batch_size)
+            # prepare batches
+            instance_reader = self._instance_reader_creator(manifest)
+            batch = []
+            for instance in instance_reader():
+                batch.append(instance)
+                if len(batch) == batch_size:
+                    yield self._padding_batch(batch, padding_to, flatten)
+                    batch = []
+            if len(batch) > 0:
+                yield self._padding_batch(batch, padding_to, flatten)
+            self._epoch += 1
+
+        return batch_reader
+
+    @property
+    def feeding(self):
+        """Returns data_reader's feeding dict."""
+        return {"audio_spectrogram": 0, "transcript_text": 1}
+
+    @property
+    def vocab_size(self):
+        """Returns vocabulary size."""
+        return self._speech_featurizer.vocab_size
+
+    @property
+    def vocab_list(self):
+        """Returns vocabulary list."""
+        return self._speech_featurizer.vocab_list
+
+    def _process_utterance(self, filename, transcript):
+        speech_segment = SpeechSegment.from_file(filename, transcript)
+        self._augmentation_pipeline.transform_audio(speech_segment)
+        specgram, text_ids = self._speech_featurizer.featurize(speech_segment)
+        specgram = self._normalizer.apply(specgram)
+        return specgram, text_ids
+
+    def _instance_reader_creator(self, manifest):
+        """
+        Instance reader creator for audio data. Creat a callable function to
+        produce instances of data.
+
+        Instance: a tuple of a numpy ndarray of audio spectrogram and a list of
+        tokenized and indexed transcription text.
+
+        :param manifest: Filepath of manifest for audio clip files.
+        :type manifest: basestring
+        :return: Data reader function.
+        :rtype: callable
+        """
+
+        def reader():
+            for instance in manifest:
+                yield self._process_utterance(instance["audio_filepath"],
+                                              instance["text"])
+
+        return reader
+
+    def _padding_batch(self, batch, padding_to=-1, flatten=False):
+        """
+        Padding audio part of features (only in the time axis -- column axis)
+        with zeros, to make each instance in the batch share the same
+        audio feature shape.
+
+        If `padding_to` is set -1, the maximun column numbers in the batch will
+        be used as the target size. Otherwise, `padding_to` will be the target
+        size. Default is -1.
+
+        If `flatten` is set True, audio data will be flatten to be a 1-dim
+        ndarray. Default is False.
+        """
+        new_batch = []
+        # get target shape
+        max_length = max([audio.shape[1] for audio, text in batch])
+        if padding_to != -1:
+            if padding_to < max_length:
+                raise ValueError("If padding_to is not -1, it should be greater"
+                                 " or equal to the original instance length.")
+            max_length = padding_to
+        # padding
+        for audio, text in batch:
+            padded_audio = np.zeros([audio.shape[0], max_length])
+            padded_audio[:, :audio.shape[1]] = audio
+            if flatten:
+                padded_audio = padded_audio.flatten()
+            new_batch.append((padded_audio, text))
+        return new_batch
+
+    def _batch_shuffle(self, manifest, batch_size):
+        """
+        The instances have different lengths and they cannot be
+        combined into a single matrix multiplication. It usually
+        sorts the training examples by length and combines only
+        similarly-sized instances into minibatches, pads with
+        silence when necessary so that all instances in a batch
+        have the same length. This batch shuffle fuction is used
+        to make similarly-sized instances into minibatches and
+        make a batch-wise shuffle.
+
+        1. Sort the audio clips by duration.
+        2. Generate a random number `k`, k in [0, batch_size).
+        3. Randomly remove `k` instances in order to make different mini-batches,
+           then make minibatches and each minibatch size is batch_size.
+        4. Shuffle the minibatches.
+
+        :param manifest: manifest file.
+        :type manifest: list
+        :param batch_size: Batch size. This size is also used for generate
+                           a random number for batch shuffle.
+        :type batch_size: int
+        :return: batch shuffled mainifest.
+        :rtype: list
+        """
+        manifest.sort(key=lambda x: x["duration"])
+        shift_len = self._rng.randint(0, batch_size - 1)
+        batch_manifest = zip(*[iter(manifest[shift_len:])] * batch_size)
+        self._rng.shuffle(batch_manifest)
+        batch_manifest = list(sum(batch_manifest, ()))
+        res_len = len(manifest) - shift_len - len(batch_manifest)
+        batch_manifest.extend(manifest[-res_len:])
+        batch_manifest.extend(manifest[0:shift_len])
+        return batch_manifest
diff --git a/data_utils/featurizer/__init__.py b/data_utils/featurizer/__init__.py
new file mode 100755
index 000000000..e69de29bb
diff --git a/data_utils/featurizer/audio_featurizer.py b/data_utils/featurizer/audio_featurizer.py
new file mode 100755
index 000000000..5d9c68836
--- /dev/null
+++ b/data_utils/featurizer/audio_featurizer.py
@@ -0,0 +1,86 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import random
+from data_utils import utils
+from data_utils.audio import AudioSegment
+
+
+class AudioFeaturizer(object):
+    def __init__(self,
+                 specgram_type='linear',
+                 stride_ms=10.0,
+                 window_ms=20.0,
+                 max_freq=None,
+                 random_seed=0):
+        self._specgram_type = specgram_type
+        self._stride_ms = stride_ms
+        self._window_ms = window_ms
+        self._max_freq = max_freq
+
+    def featurize(self, audio_segment):
+        return self._compute_specgram(audio_segment.samples,
+                                      audio_segment.sample_rate)
+
+    def _compute_specgram(self, samples, sample_rate):
+        if self._specgram_type == 'linear':
+            return self._compute_linear_specgram(
+                samples, sample_rate, self._stride_ms, self._window_ms,
+                self._max_freq)
+        else:
+            raise ValueError("Unknown specgram_type %s. "
+                             "Supported values: linear." % self._specgram_type)
+
+    def _compute_linear_specgram(self,
+                                 samples,
+                                 sample_rate,
+                                 stride_ms=10.0,
+                                 window_ms=20.0,
+                                 max_freq=None,
+                                 eps=1e-14):
+        """Laod audio data and calculate the log of spectrogram by FFT.
+        Refer to utils.py in https://github.com/baidu-research/ba-dls-deepspeech
+        """
+        if max_freq is None:
+            max_freq = sample_rate / 2
+        if max_freq > sample_rate / 2:
+            raise ValueError("max_freq must be greater than half of "
+                             "sample rate.")
+        if stride_ms > window_ms:
+            raise ValueError("Stride size must not be greater than "
+                             "window size.")
+        stride_size = int(0.001 * sample_rate * stride_ms)
+        window_size = int(0.001 * sample_rate * window_ms)
+        specgram, freqs = self._specgram_real(
+            samples,
+            window_size=window_size,
+            stride_size=stride_size,
+            sample_rate=sample_rate)
+        ind = np.where(freqs <= max_freq)[0][-1] + 1
+        return np.log(specgram[:ind, :] + eps)
+
+    def _specgram_real(self, samples, window_size, stride_size, sample_rate):
+        """Compute the spectrogram by FFT for a discrete real signal.
+        Refer to utils.py in https://github.com/baidu-research/ba-dls-deepspeech
+        """
+        # extract strided windows
+        truncate_size = (len(samples) - window_size) % stride_size
+        samples = samples[:len(samples) - truncate_size]
+        nshape = (window_size, (len(samples) - window_size) // stride_size + 1)
+        nstrides = (samples.strides[0], samples.strides[0] * stride_size)
+        windows = np.lib.stride_tricks.as_strided(
+            samples, shape=nshape, strides=nstrides)
+        assert np.all(
+            windows[:, 1] == samples[stride_size:(stride_size + window_size)])
+        # window weighting, squared Fast Fourier Transform (fft), scaling
+        weighting = np.hanning(window_size)[:, None]
+        fft = np.fft.rfft(windows * weighting, axis=0)
+        fft = np.absolute(fft)**2
+        scale = np.sum(weighting**2) * sample_rate
+        fft[1:-1, :] *= (2.0 / scale)
+        fft[(0, -1), :] /= scale
+        # prepare fft frequency list
+        freqs = float(sample_rate) / window_size * np.arange(fft.shape[0])
+        return fft, freqs
diff --git a/data_utils/featurizer/speech_featurizer.py b/data_utils/featurizer/speech_featurizer.py
new file mode 100755
index 000000000..06af7a026
--- /dev/null
+++ b/data_utils/featurizer/speech_featurizer.py
@@ -0,0 +1,32 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from data_utils.featurizer.audio_featurizer import AudioFeaturizer
+from data_utils.featurizer.text_featurizer import TextFeaturizer
+
+
+class SpeechFeaturizer(object):
+    def __init__(self,
+                 vocab_filepath,
+                 specgram_type='linear',
+                 stride_ms=10.0,
+                 window_ms=20.0,
+                 max_freq=None,
+                 random_seed=0):
+        self._audio_featurizer = AudioFeaturizer(
+            specgram_type, stride_ms, window_ms, max_freq, random_seed)
+        self._text_featurizer = TextFeaturizer(vocab_filepath)
+
+    def featurize(self, speech_segment):
+        audio_feature = self._audio_featurizer.featurize(speech_segment)
+        text_ids = self._text_featurizer.text2ids(speech_segment.transcript)
+        return audio_feature, text_ids
+
+    @property
+    def vocab_size(self):
+        return self._text_featurizer.vocab_size
+
+    @property
+    def vocab_list(self):
+        return self._text_featurizer.vocab_list
diff --git a/data_utils/featurizer/text_featurizer.py b/data_utils/featurizer/text_featurizer.py
new file mode 100755
index 000000000..7e4b69d7b
--- /dev/null
+++ b/data_utils/featurizer/text_featurizer.py
@@ -0,0 +1,39 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+
+class TextFeaturizer(object):
+    def __init__(self, vocab_filepath):
+        self._vocab_dict, self._vocab_list = self._load_vocabulary_from_file(
+            vocab_filepath)
+
+    def text2ids(self, text):
+        tokens = self._char_tokenize(text)
+        return [self._vocab_dict[token] for token in tokens]
+
+    def ids2text(self, ids):
+        return ''.join([self._vocab_list[id] for id in ids])
+
+    @property
+    def vocab_size(self):
+        return len(self._vocab_list)
+
+    @property
+    def vocab_list(self):
+        return self._vocab_list
+
+    def _char_tokenize(self, text):
+        return list(text.strip())
+
+    def _load_vocabulary_from_file(self, vocab_filepath):
+        """Load vocabulary from file."""
+        vocab_lines = []
+        with open(vocab_filepath, 'r') as file:
+            vocab_lines.extend(file.readlines())
+        vocab_list = [line[:-1] for line in vocab_lines]
+        vocab_dict = dict(
+            [(token, id) for (id, token) in enumerate(vocab_list)])
+        return vocab_dict, vocab_list
diff --git a/data_utils/normalizer.py b/data_utils/normalizer.py
new file mode 100755
index 000000000..364600af8
--- /dev/null
+++ b/data_utils/normalizer.py
@@ -0,0 +1,49 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import random
+import data_utils.utils as utils
+from data_utils.audio import AudioSegment
+
+
+class FeatureNormalizer(object):
+    def __init__(self,
+                 mean_std_filepath,
+                 manifest_path=None,
+                 featurize_func=None,
+                 num_samples=500,
+                 random_seed=0):
+        if not mean_std_filepath:
+            if not (manifest_path and featurize_func):
+                raise ValueError("If mean_std_filepath is None, meanifest_path "
+                                 "and featurize_func should not be None.")
+            self._rng = random.Random(random_seed)
+            self._compute_mean_std(manifest_path, featurize_func, num_samples)
+        else:
+            self._read_mean_std_from_file(mean_std_filepath)
+
+    def apply(self, features, eps=1e-14):
+        """Normalize features to be of zero mean and unit stddev."""
+        return (features - self._mean) / (self._std + eps)
+
+    def write_to_file(self, filepath):
+        np.savez(filepath, mean=self._mean, std=self._std)
+
+    def _read_mean_std_from_file(self, filepath):
+        npzfile = np.load(filepath)
+        self._mean = npzfile["mean"]
+        self._std = npzfile["std"]
+
+    def _compute_mean_std(self, manifest_path, featurize_func, num_samples):
+        manifest = utils.read_manifest(manifest_path)
+        sampled_manifest = self._rng.sample(manifest, num_samples)
+        features = []
+        for instance in sampled_manifest:
+            features.append(
+                featurize_func(
+                    AudioSegment.from_file(instance["audio_filepath"])))
+        features = np.hstack(features)
+        self._mean = np.mean(features, axis=1).reshape([-1, 1])
+        self._std = np.std(features, axis=1).reshape([-1, 1])
diff --git a/data_utils/utils.py b/data_utils/utils.py
new file mode 100755
index 000000000..2a916b54f
--- /dev/null
+++ b/data_utils/utils.py
@@ -0,0 +1,19 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+
+
+def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
+    """Load and parse manifest file."""
+    manifest = []
+    for json_line in open(manifest_path):
+        try:
+            json_data = json.loads(json_line)
+        except Exception as e:
+            raise IOError("Error reading manifest: %s" % str(e))
+        if (json_data["duration"] <= max_duration and
+                json_data["duration"] >= min_duration):
+            manifest.append(json_data)
+    return manifest
diff --git a/data/librispeech.py b/datasets/librispeech/librispeech.py
similarity index 99%
rename from data/librispeech.py
rename to datasets/librispeech/librispeech.py
index 653caa926..1ba2a4422 100644
--- a/data/librispeech.py
+++ b/datasets/librispeech/librispeech.py
@@ -44,7 +44,7 @@ parser.add_argument(
     help="Directory to save the dataset. (default: %(default)s)")
 parser.add_argument(
     "--manifest_prefix",
-    default="manifest.libri",
+    default="manifest",
     type=str,
     help="Filepath prefix for output manifests. (default: %(default)s)")
 parser.add_argument(
diff --git a/datasets/run_all.sh b/datasets/run_all.sh
new file mode 100755
index 000000000..ef2b721fb
--- /dev/null
+++ b/datasets/run_all.sh
@@ -0,0 +1,13 @@
+cd librispeech
+python librispeech.py
+if [ $? -ne 0 ]; then
+    echo "Prepare LibriSpeech failed. Terminated."
+    exit 1
+fi
+cd -
+
+cat librispeech/manifest.train* | shuf > manifest.train
+cat librispeech/manifest.dev-clean > manifest.dev
+cat librispeech/manifest.test-clean > manifest.test
+
+echo "All done."
diff --git a/data/eng_vocab.txt b/datasets/vocab/eng_vocab.txt
similarity index 100%
rename from data/eng_vocab.txt
rename to datasets/vocab/eng_vocab.txt
diff --git a/infer.py b/infer.py
index 598c348b0..eb31254ce 100644
--- a/infer.py
+++ b/infer.py
@@ -2,11 +2,15 @@
    Inference for a simplifed version of Baidu DeepSpeech2 model.
 """
 
-import paddle.v2 as paddle
-import distutils.util
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
 import argparse
 import gzip
-from audio_data_utils import DataGenerator
+import distutils.util
+import paddle.v2 as paddle
+from data_utils.data import DataGenerator
 from model import deep_speech2
 from decoder import ctc_decode
 
@@ -38,13 +42,13 @@ parser.add_argument(
     type=distutils.util.strtobool,
     help="Use gpu or not. (default: %(default)s)")
 parser.add_argument(
-    "--normalizer_manifest_path",
-    default='data/manifest.libri.train-clean-100',
+    "--mean_std_filepath",
+    default='mean_std.npz',
     type=str,
     help="Manifest path for normalizer. (default: %(default)s)")
 parser.add_argument(
     "--decode_manifest_path",
-    default='data/manifest.libri.test-clean',
+    default='datasets/manifest.test',
     type=str,
     help="Manifest path for decoding. (default: %(default)s)")
 parser.add_argument(
@@ -54,7 +58,7 @@ parser.add_argument(
     help="Model filepath. (default: %(default)s)")
 parser.add_argument(
     "--vocab_filepath",
-    default='data/eng_vocab.txt',
+    default='datasets/vocab/eng_vocab.txt',
     type=str,
     help="Vocabulary filepath. (default: %(default)s)")
 args = parser.parse_args()
@@ -67,28 +71,22 @@ def infer():
     # initialize data generator
     data_generator = DataGenerator(
         vocab_filepath=args.vocab_filepath,
-        normalizer_manifest_path=args.normalizer_manifest_path,
-        normalizer_num_samples=200,
-        max_duration=20.0,
-        min_duration=0.0,
-        stride_ms=10,
-        window_ms=20)
+        mean_std_filepath=args.mean_std_filepath,
+        augmentation_config='{}')
 
     # create network config
-    dict_size = data_generator.vocabulary_size()
-    vocab_list = data_generator.vocabulary_list()
+    # paddle.data_type.dense_array is used for variable batch input.
+    # The size 161 * 161 is only an placeholder value and the real shape
+    # of input batch data will be induced during training.
     audio_data = paddle.layer.data(
-        name="audio_spectrogram",
-        height=161,
-        width=2000,
-        type=paddle.data_type.dense_vector(322000))
+        name="audio_spectrogram", type=paddle.data_type.dense_array(161 * 161))
     text_data = paddle.layer.data(
         name="transcript_text",
-        type=paddle.data_type.integer_value_sequence(dict_size))
+        type=paddle.data_type.integer_value_sequence(data_generator.vocab_size))
     output_probs = deep_speech2(
         audio_data=audio_data,
         text_data=text_data,
-        dict_size=dict_size,
+        dict_size=data_generator.vocab_size,
         num_conv_layers=args.num_conv_layers,
         num_rnn_layers=args.num_rnn_layers,
         rnn_size=args.rnn_layer_size,
@@ -99,31 +97,30 @@ def infer():
         gzip.open(args.model_filepath))
 
     # prepare infer data
-    feeding = data_generator.data_name_feeding()
-    test_batch_reader = data_generator.batch_reader_creator(
+    batch_reader = data_generator.batch_reader_creator(
         manifest_path=args.decode_manifest_path,
         batch_size=args.num_samples,
-        padding_to=2000,
-        flatten=True,
-        sort_by_duration=False,
-        shuffle=False)
-    infer_data = test_batch_reader().next()
+        sortagrad=False,
+        batch_shuffle=False)
+    infer_data = batch_reader().next()
 
     # run inference
     infer_results = paddle.infer(
         output_layer=output_probs, parameters=parameters, input=infer_data)
-    num_steps = len(infer_results) / len(infer_data)
+    num_steps = len(infer_results) // len(infer_data)
     probs_split = [
         infer_results[i * num_steps:(i + 1) * num_steps]
-        for i in xrange(0, len(infer_data))
+        for i in xrange(len(infer_data))
     ]
 
     # decode and print
     for i, probs in enumerate(probs_split):
         output_transcription = ctc_decode(
-            probs_seq=probs, vocabulary=vocab_list, method="best_path")
+            probs_seq=probs,
+            vocabulary=data_generator.vocab_list,
+            method="best_path")
         target_transcription = ''.join(
-            [vocab_list[index] for index in infer_data[i][1]])
+            [data_generator.vocab_list[index] for index in infer_data[i][1]])
         print("Target Transcription: %s \nOutput Transcription: %s \n" %
               (target_transcription, output_transcription))
 
diff --git a/train.py b/train.py
index 957c24267..c6aa97527 100644
--- a/train.py
+++ b/train.py
@@ -2,21 +2,21 @@
    Trainer for a simplifed version of Baidu DeepSpeech2 model.
 """
 
-import paddle.v2 as paddle
-import distutils.util
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+import os
 import argparse
 import gzip
 import time
-import sys
+import distutils.util
+import paddle.v2 as paddle
 from model import deep_speech2
-from audio_data_utils import DataGenerator
-import numpy as np
-import os
+from data_utils.data import DataGenerator
 
-#TODO: add WER metric
-
-parser = argparse.ArgumentParser(
-    description='Simplified version of DeepSpeech2 trainer.')
+parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
     "--batch_size", default=32, type=int, help="Minibatch size.")
 parser.add_argument(
@@ -51,7 +51,7 @@ parser.add_argument(
     help="Use gpu or not. (default: %(default)s)")
 parser.add_argument(
     "--use_sortagrad",
-    default=False,
+    default=True,
     type=distutils.util.strtobool,
     help="Use sortagrad or not. (default: %(default)s)")
 parser.add_argument(
@@ -60,23 +60,23 @@ parser.add_argument(
     type=int,
     help="Trainer number. (default: %(default)s)")
 parser.add_argument(
-    "--normalizer_manifest_path",
-    default='data/manifest.libri.train-clean-100',
+    "--mean_std_filepath",
+    default='mean_std.npz',
     type=str,
     help="Manifest path for normalizer. (default: %(default)s)")
 parser.add_argument(
     "--train_manifest_path",
-    default='data/manifest.libri.train-clean-100',
+    default='datasets/manifest.train',
     type=str,
     help="Manifest path for training. (default: %(default)s)")
 parser.add_argument(
     "--dev_manifest_path",
-    default='data/manifest.libri.dev-clean',
+    default='datasets/manifest.dev',
     type=str,
     help="Manifest path for validation. (default: %(default)s)")
 parser.add_argument(
     "--vocab_filepath",
-    default='data/eng_vocab.txt',
+    default='datasets/vocab/eng_vocab.txt',
     type=str,
     help="Vocabulary filepath. (default: %(default)s)")
 parser.add_argument(
@@ -86,6 +86,12 @@ parser.add_argument(
     help="If set None, the training will start from scratch. "
     "Otherwise, the training will resume from "
     "the existing model of this path. (default: %(default)s)")
+parser.add_argument(
+    "--augmentation_config",
+    default='{}',
+    type=str,
+    help="Augmentation configuration in json-format. "
+    "(default: %(default)s)")
 args = parser.parse_args()
 
 
@@ -98,29 +104,26 @@ def train():
     def data_generator():
         return DataGenerator(
             vocab_filepath=args.vocab_filepath,
-            normalizer_manifest_path=args.normalizer_manifest_path,
-            normalizer_num_samples=200,
-            max_duration=20.0,
-            min_duration=0.0,
-            stride_ms=10,
-            window_ms=20)
+            mean_std_filepath=args.mean_std_filepath,
+            augmentation_config=args.augmentation_config)
 
     train_generator = data_generator()
     test_generator = data_generator()
+
     # create network config
-    dict_size = train_generator.vocabulary_size()
     # paddle.data_type.dense_array is used for variable batch input.
-    # the size 161 * 161 is only an placeholder value and the real shape
-    # of input batch data will be set at each batch.
+    # The size 161 * 161 is only an placeholder value and the real shape
+    # of input batch data will be induced during training.
     audio_data = paddle.layer.data(
         name="audio_spectrogram", type=paddle.data_type.dense_array(161 * 161))
     text_data = paddle.layer.data(
         name="transcript_text",
-        type=paddle.data_type.integer_value_sequence(dict_size))
+        type=paddle.data_type.integer_value_sequence(
+            train_generator.vocab_size))
     cost = deep_speech2(
         audio_data=audio_data,
         text_data=text_data,
-        dict_size=dict_size,
+        dict_size=train_generator.vocab_size,
         num_conv_layers=args.num_conv_layers,
         num_rnn_layers=args.num_rnn_layers,
         rnn_size=args.rnn_layer_size,
@@ -143,13 +146,13 @@ def train():
     train_batch_reader = train_generator.batch_reader_creator(
         manifest_path=args.train_manifest_path,
         batch_size=args.batch_size,
-        sortagrad=True if args.init_model_path is None else False,
+        sortagrad=args.use_sortagrad if args.init_model_path is None else False,
         batch_shuffle=True)
     test_batch_reader = test_generator.batch_reader_creator(
         manifest_path=args.dev_manifest_path,
         batch_size=args.batch_size,
+        sortagrad=False,
         batch_shuffle=False)
-    feeding = train_generator.data_name_feeding()
 
     # create event handler
     def event_handler(event):
@@ -158,8 +161,8 @@ def train():
             cost_sum += event.cost
             cost_counter += 1
             if event.batch_id % 50 == 0:
-                print "\nPass: %d, Batch: %d, TrainCost: %f" % (
-                    event.pass_id, event.batch_id, cost_sum / cost_counter)
+                print("\nPass: %d, Batch: %d, TrainCost: %f" %
+                      (event.pass_id, event.batch_id, cost_sum / cost_counter))
                 cost_sum, cost_counter = 0.0, 0
                 with gzip.open("params.tar.gz", 'w') as f:
                     parameters.to_tar(f)
@@ -170,16 +173,17 @@ def train():
             start_time = time.time()
             cost_sum, cost_counter = 0.0, 0
         if isinstance(event, paddle.event.EndPass):
-            result = trainer.test(reader=test_batch_reader, feeding=feeding)
-            print "\n------- Time: %d sec,  Pass: %d, ValidationCost: %s" % (
-                time.time() - start_time, event.pass_id, result.cost)
+            result = trainer.test(
+                reader=test_batch_reader, feeding=test_generator.feeding)
+            print("\n------- Time: %d sec,  Pass: %d, ValidationCost: %s" %
+                  (time.time() - start_time, event.pass_id, result.cost))
 
     # run train
     trainer.train(
         reader=train_batch_reader,
         event_handler=event_handler,
         num_passes=args.num_passes,
-        feeding=feeding)
+        feeding=train_generator.feeding)
 
 
 def main():