From b3ebf3fd620324b83859c66dd07f42c91a37ec07 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Thu, 9 Nov 2017 22:47:08 +0800 Subject: [PATCH 1/2] Support padding removing. --- data_utils/data.py | 37 ++++++++++++++++++++++++++++++++++--- infer.py | 8 +++++--- model_utils/model.py | 30 ++++++++++++++++++++++++++---- model_utils/network.py | 42 +++++++++++++++++++++++++++++++++++------- test.py | 8 +++++--- train.py | 6 ++++-- 6 files changed, 109 insertions(+), 22 deletions(-) diff --git a/data_utils/data.py b/data_utils/data.py index 70ee6fba..1469beb0 100644 --- a/data_utils/data.py +++ b/data_utils/data.py @@ -59,6 +59,9 @@ class DataGenerator(object): be passed forward directly without converting to index sequence. :type keep_transcription_text: bool + :param num_conv_layers: The number of convolution layer, used to compute + the sequence length. + :type num_conv_layers: int """ def __init__(self, @@ -74,7 +77,8 @@ class DataGenerator(object): use_dB_normalization=True, num_threads=multiprocessing.cpu_count() // 2, random_seed=0, - keep_transcription_text=False): + keep_transcription_text=False, + num_conv_layers=2): self._max_duration = max_duration self._min_duration = min_duration self._normalizer = FeatureNormalizer(mean_std_filepath) @@ -95,6 +99,7 @@ class DataGenerator(object): self._local_data = local() self._local_data.tar2info = {} self._local_data.tar2object = {} + self._num_conv_layers = num_conv_layers def process_utterance(self, filename, transcript): """Load, augment, featurize and normalize for speech data. @@ -213,7 +218,15 @@ class DataGenerator(object): :return: Data feeding dict. :rtype: dict """ - return {"audio_spectrogram": 0, "transcript_text": 1} + feeding_dict = { + "audio_spectrogram": 0, + "transcript_text": 1, + "sequence_offset": 2, + "sequence_length": 3 + } + for i in xrange(self._num_conv_layers): + feeding_dict["conv%d_index_range" % i] = len(feeding_dict) + return feeding_dict @property def vocab_size(self): @@ -306,7 +319,25 @@ class DataGenerator(object): padded_audio[:, :audio.shape[1]] = audio if flatten: padded_audio = padded_audio.flatten() - new_batch.append((padded_audio, text)) + + padded_instance = [padded_audio, text] + padded_conv0_h = (padded_audio.shape[0] - 1) // 2 + 1 + padded_conv0_w = (padded_audio.shape[1] - 1) // 3 + 1 + valid_w = (audio.shape[1] - 1) // 3 + 1 + padded_instance += [ + [0], # sequence offset, always 0 + [valid_w], # valid sequence length + [1, 32, 1, padded_conv0_h, valid_w + 1, padded_conv0_w] + ] + pre_padded_h = padded_conv0_h + for i in xrange(self._num_conv_layers - 1): + padded_h = (pre_padded_h - 1) // 2 + 1 + pre_padded_h = padded_h + padded_instance += [ + [1, 32, 1, padded_h, valid_w + 1, padded_conv0_w] + ] + + new_batch.append(padded_instance) return new_batch def _batch_shuffle(self, manifest, batch_size, clipped=False): diff --git a/infer.py b/infer.py index 9ac3e632..32d15f12 100644 --- a/infer.py +++ b/infer.py @@ -69,7 +69,8 @@ def infer(): augmentation_config='{}', specgram_type=args.specgram_type, num_threads=1, - keep_transcription_text=True) + keep_transcription_text=True, + num_conv_layers=args.num_conv_layers) batch_reader = data_generator.batch_reader_creator( manifest_path=args.infer_manifest, batch_size=args.num_samples, @@ -100,10 +101,11 @@ def infer(): cutoff_top_n=args.cutoff_top_n, vocab_list=vocab_list, language_model_path=args.lang_model_path, - num_processes=args.num_proc_bsearch) + num_processes=args.num_proc_bsearch, + feeding_dict=data_generator.feeding) error_rate_func = cer if 
args.error_rate_type == 'cer' else wer - target_transcripts = [transcript for _, transcript in infer_data] + target_transcripts = [data[1] for data in infer_data] for target, result in zip(target_transcripts, result_transcripts): print("\nTarget Transcription: %s\nOutput Transcription: %s" % (target, result)) diff --git a/model_utils/model.py b/model_utils/model.py index 5a0d8890..26aa1470 100644 --- a/model_utils/model.py +++ b/model_utils/model.py @@ -165,7 +165,7 @@ class DeepSpeech2Model(object): def infer_batch(self, infer_data, decoding_method, beam_alpha, beam_beta, beam_size, cutoff_prob, cutoff_top_n, vocab_list, - language_model_path, num_processes): + language_model_path, num_processes, feeding_dict): """Model inference. Infer the transcription for a batch of speech utterances. @@ -195,6 +195,9 @@ class DeepSpeech2Model(object): :type language_model_path: basestring|None :param num_processes: Number of processes (CPU) for decoder. :type num_processes: int + :param feeding_dict: Feeding is a map of field name and tuple index + of the data that reader returns. + :type feeding_dict: dict|list :return: List of transcription texts. :rtype: List of basestring """ @@ -203,10 +206,13 @@ class DeepSpeech2Model(object): self._inferer = paddle.inference.Inference( output_layer=self._log_probs, parameters=self._parameters) # run inference - infer_results = self._inferer.infer(input=infer_data) - num_steps = len(infer_results) // len(infer_data) + infer_results = self._inferer.infer( + input=infer_data, feeding=feeding_dict) + start_pos = [0] * (len(infer_data) + 1) + for i in xrange(len(infer_data)): + start_pos[i + 1] = start_pos[i] + infer_data[i][3][0] probs_split = [ - infer_results[i * num_steps:(i + 1) * num_steps] + infer_results[start_pos[i]:start_pos[i + 1]] for i in xrange(0, len(infer_data)) ] # run decoder @@ -274,9 +280,25 @@ class DeepSpeech2Model(object): text_data = paddle.layer.data( name="transcript_text", type=paddle.data_type.integer_value_sequence(vocab_size)) + seq_offset_data = paddle.layer.data( + name='sequence_offset', + type=paddle.data_type.integer_value_sequence(1)) + seq_len_data = paddle.layer.data( + name='sequence_length', + type=paddle.data_type.integer_value_sequence(1)) + index_range_datas = [] + for i in xrange(num_rnn_layers): + index_range_datas.append( + paddle.layer.data( + name='conv%d_index_range' % i, + type=paddle.data_type.dense_vector(6))) + self._log_probs, self._loss = deep_speech_v2_network( audio_data=audio_data, text_data=text_data, + seq_offset_data=seq_offset_data, + seq_len_data=seq_len_data, + index_range_datas=index_range_datas, dict_size=vocab_size, num_conv_layers=num_conv_layers, num_rnn_layers=num_rnn_layers, diff --git a/model_utils/network.py b/model_utils/network.py index 13ba5d2c..2053e906 100644 --- a/model_utils/network.py +++ b/model_utils/network.py @@ -7,7 +7,7 @@ import paddle.v2 as paddle def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride, - padding, act): + padding, act, index_range_data): """Convolution layer with batch normalization. :param input: Input layer. @@ -24,6 +24,8 @@ def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride, :type padding: int|tuple|list :param act: Activation type. :type act: BaseActivation + :param index_range_data: Index range to indicate sub region. + :type index_range_data: LayerOutput :return: Batch norm layer after convolution layer. 
:rtype: LayerOutput """ @@ -36,7 +38,11 @@ def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride, padding=padding, act=paddle.activation.Linear(), bias_attr=False) - return paddle.layer.batch_norm(input=conv_layer, act=act) + batch_norm = paddle.layer.batch_norm(input=conv_layer, act=act) + # reset padding part to 0 + scale_sub_region = paddle.layer.scale_sub_region( + batch_norm, index_range_data, value=0.0) + return scale_sub_region def bidirectional_simple_rnn_bn_layer(name, input, size, act, share_weights): @@ -136,13 +142,15 @@ def bidirectional_gru_bn_layer(name, input, size, act): return paddle.layer.concat(input=[forward_gru, backward_gru]) -def conv_group(input, num_stacks): +def conv_group(input, num_stacks, index_range_datas): """Convolution group with stacked convolution layers. :param input: Input layer. :type input: LayerOutput :param num_stacks: Number of stacked convolution layers. :type num_stacks: int + :param index_range_datas: Index ranges for each convolution layer. + :type index_range_datas: tuple|list :return: Output layer of the convolution group. :rtype: LayerOutput """ @@ -153,7 +161,8 @@ def conv_group(input, num_stacks): num_channels_out=32, stride=(3, 2), padding=(5, 20), - act=paddle.activation.BRelu()) + act=paddle.activation.BRelu(), + index_range_data=index_range_datas[0]) for i in xrange(num_stacks - 1): conv = conv_bn_layer( input=conv, @@ -162,7 +171,8 @@ def conv_group(input, num_stacks): num_channels_out=32, stride=(1, 2), padding=(5, 10), - act=paddle.activation.BRelu()) + act=paddle.activation.BRelu(), + index_range_data=index_range_datas[i + 1]) output_num_channels = 32 output_height = 160 // pow(2, num_stacks) + 1 return conv, output_num_channels, output_height @@ -207,6 +217,9 @@ def rnn_group(input, size, num_stacks, use_gru, share_rnn_weights): def deep_speech_v2_network(audio_data, text_data, + seq_offset_data, + seq_len_data, + index_range_datas, dict_size, num_conv_layers=2, num_rnn_layers=3, @@ -219,6 +232,12 @@ def deep_speech_v2_network(audio_data, :type audio_data: LayerOutput :param text_data: Transcription text data layer. :type text_data: LayerOutput + :param seq_offset_data: Sequence offset data layer. + :type seq_offset_data: LayerOutput + :param seq_len_data: Valid sequence length data layer. + :type seq_len_data: LayerOutput + :param index_range_datas: Index ranges data layers. + :type index_range_datas: tuple|list :param dict_size: Dictionary size for tokenized transcription. :type dict_size: int :param num_conv_layers: Number of stacking convolution layers. 
@@ -239,7 +258,9 @@ def deep_speech_v2_network(audio_data, """ # convolution group conv_group_output, conv_group_num_channels, conv_group_height = conv_group( - input=audio_data, num_stacks=num_conv_layers) + input=audio_data, + num_stacks=num_conv_layers, + index_range_datas=index_range_datas) # convert data form convolution feature map to sequence of vectors conv2seq = paddle.layer.block_expand( input=conv_group_output, @@ -248,9 +269,16 @@ def deep_speech_v2_network(audio_data, stride_y=1, block_x=1, block_y=conv_group_height) + # remove padding part + remove_padding = paddle.layer.sub_seq( + input=conv2seq, + offsets=seq_offset_data, + sizes=seq_len_data, + act=paddle.activation.Linear(), + bias_attr=False) # rnn group rnn_group_output = rnn_group( - input=conv2seq, + input=remove_padding, size=rnn_size, num_stacks=num_rnn_layers, use_gru=use_gru, diff --git a/test.py b/test.py index 63fc4f65..53f7e17a 100644 --- a/test.py +++ b/test.py @@ -70,7 +70,8 @@ def evaluate(): augmentation_config='{}', specgram_type=args.specgram_type, num_threads=args.num_proc_data, - keep_transcription_text=True) + keep_transcription_text=True, + num_conv_layers=args.num_conv_layers) batch_reader = data_generator.batch_reader_creator( manifest_path=args.test_manifest, batch_size=args.batch_size, @@ -103,8 +104,9 @@ def evaluate(): cutoff_top_n=args.cutoff_top_n, vocab_list=vocab_list, language_model_path=args.lang_model_path, - num_processes=args.num_proc_bsearch) - target_transcripts = [transcript for _, transcript in infer_data] + num_processes=args.num_proc_bsearch, + feeding_dict=data_generator.feeding) + target_transcripts = [data[1] for data in infer_data] for target, result in zip(target_transcripts, result_transcripts): error_sum += error_rate_func(target, result) num_ins += 1 diff --git a/train.py b/train.py index 16415713..562fb462 100644 --- a/train.py +++ b/train.py @@ -75,13 +75,15 @@ def train(): max_duration=args.max_duration, min_duration=args.min_duration, specgram_type=args.specgram_type, - num_threads=args.num_proc_data) + num_threads=args.num_proc_data, + num_conv_layers=args.num_conv_layers) dev_generator = DataGenerator( vocab_filepath=args.vocab_path, mean_std_filepath=args.mean_std_path, augmentation_config="{}", specgram_type=args.specgram_type, - num_threads=args.num_proc_data) + num_threads=args.num_proc_data, + num_conv_layers=args.num_conv_layers) train_batch_reader = train_generator.batch_reader_creator( manifest_path=args.train_manifest, batch_size=args.batch_size, From f38d948193a1fb6ef967e2036e5c7cbceabaec16 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Fri, 10 Nov 2017 14:43:05 +0800 Subject: [PATCH 2/2] Add more comments. 
---
 data_utils/data.py     | 5 +++++
 model_utils/network.py | 4 ++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/data_utils/data.py b/data_utils/data.py
index 1469beb0..d913e48a 100644
--- a/data_utils/data.py
+++ b/data_utils/data.py
@@ -320,6 +320,9 @@ class DataGenerator(object):
             if flatten:
                 padded_audio = padded_audio.flatten()
 
+            # Stride size for conv0 is (3, 2)
+            # Stride size for conv1 to convN is (1, 2)
+            # Hard-coded here; must be kept consistent with the network definition
             padded_instance = [padded_audio, text]
             padded_conv0_h = (padded_audio.shape[0] - 1) // 2 + 1
             padded_conv0_w = (padded_audio.shape[1] - 1) // 3 + 1
@@ -327,6 +330,8 @@ class DataGenerator(object):
             padded_instance += [
                 [0],  # sequence offset, always 0
                 [valid_w],  # valid sequence length
+                # Index ranges for channel, height and width
+                # Please refer to the scale_sub_region layer for details
                 [1, 32, 1, padded_conv0_h, valid_w + 1, padded_conv0_w]
             ]
             pre_padded_h = padded_conv0_h
diff --git a/model_utils/network.py b/model_utils/network.py
index 2053e906..7b4b8ab2 100644
--- a/model_utils/network.py
+++ b/model_utils/network.py
@@ -270,7 +270,7 @@ def deep_speech_v2_network(audio_data,
         block_x=1,
         block_y=conv_group_height)
     # remove padding part
-    remove_padding = paddle.layer.sub_seq(
+    remove_padding_data = paddle.layer.sub_seq(
         input=conv2seq,
         offsets=seq_offset_data,
         sizes=seq_len_data,
         act=paddle.activation.Linear(),
         bias_attr=False)
     # rnn group
     rnn_group_output = rnn_group(
-        input=remove_padding,
+        input=remove_padding_data,
         size=rnn_size,
         num_stacks=num_rnn_layers,
         use_gru=use_gru,
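
A quick way to sanity-check the new feeding fields is to replay the shape arithmetic that `_padding_batch` now performs. The snippet below is a standalone sketch, not part of the patch: the helper names (`conv_out_len`, `padding_index_ranges`) are made up for this note, and the strides, (3, 2) for conv0 and (1, 2) for the later conv layers, are assumed to match the hard-coded values in `conv_group`.

    def conv_out_len(in_len, stride):
        # Matches the "(length - 1) // stride + 1" arithmetic used in
        # DataGenerator._padding_batch for a strided convolution output.
        return (in_len - 1) // stride + 1

    def padding_index_ranges(padded_shape, valid_width, num_conv_layers=2):
        # padded_shape: (freq_bins, padded_time_steps) of the padded spectrogram.
        # valid_width:  number of unpadded time steps of this utterance.
        padded_h = conv_out_len(padded_shape[0], 2)  # conv0 halves the frequency axis
        padded_w = conv_out_len(padded_shape[1], 3)  # conv0 strides the time axis by 3
        valid_w = conv_out_len(valid_width, 3)       # valid time steps after conv0
        # [chan_begin, chan_end, h_begin, h_end, w_begin, w_end]: the region
        # that holds only padding and gets reset to zero via scale_sub_region.
        ranges = [[1, 32, 1, padded_h, valid_w + 1, padded_w]]
        for _ in range(num_conv_layers - 1):
            padded_h = conv_out_len(padded_h, 2)     # later layers halve the height only
            ranges.append([1, 32, 1, padded_h, valid_w + 1, padded_w])
        return valid_w, ranges

    # A 161 x 200 spectrogram padded out to 300 time steps:
    valid_w, ranges = padding_index_ranges((161, 300), 200)
    print(valid_w)  # 67 -> fed as "sequence_length"
    print(ranges)   # [[1, 32, 1, 81, 68, 100], [1, 32, 1, 41, 68, 100]]

Reading the ranges as 1-based, inclusive bounds (which is what the values built in `_padding_batch` suggest), `scale_sub_region` zeroes the padding columns of each conv feature map, and `sub_seq` then keeps only the first `sequence_length` columns of the flattened feature sequence, which is why `sequence_offset` is always `[0]`.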