From f38d948193a1fb6ef967e2036e5c7cbceabaec16 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Fri, 10 Nov 2017 14:43:05 +0800 Subject: [PATCH] Add more comments. --- data_utils/data.py | 5 +++++ model_utils/network.py | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/data_utils/data.py b/data_utils/data.py index 1469beb0..d913e48a 100644 --- a/data_utils/data.py +++ b/data_utils/data.py @@ -320,6 +320,9 @@ class DataGenerator(object): if flatten: padded_audio = padded_audio.flatten() + # Stride size for conv0 is (3, 2) + # Stride size for conv1 to convN is (1, 2) + # Same as the network, hard-coded here padded_instance = [padded_audio, text] padded_conv0_h = (padded_audio.shape[0] - 1) // 2 + 1 padded_conv0_w = (padded_audio.shape[1] - 1) // 3 + 1 @@ -327,6 +330,8 @@ class DataGenerator(object): padded_instance += [ [0], # sequence offset, always 0 [valid_w], # valid sequence length + # Index ranges for channel, height and width + # Please refer to the scale_sub_region layer for details [1, 32, 1, padded_conv0_h, valid_w + 1, padded_conv0_w] ] pre_padded_h = padded_conv0_h diff --git a/model_utils/network.py b/model_utils/network.py index 2053e906..7b4b8ab2 100644 --- a/model_utils/network.py +++ b/model_utils/network.py @@ -270,7 +270,7 @@ def deep_speech_v2_network(audio_data, block_x=1, block_y=conv_group_height) # remove padding part - remove_padding = paddle.layer.sub_seq( + remove_padding_data = paddle.layer.sub_seq( input=conv2seq, offsets=seq_offset_data, sizes=seq_len_data, @@ -278,7 +278,7 @@ def deep_speech_v2_network(audio_data, bias_attr=False) # rnn group rnn_group_output = rnn_group( - input=remove_padding, + input=remove_padding_data, size=rnn_size, num_stacks=num_rnn_layers, use_gru=use_gru,