fix data

5 years ago · 0f11e93e7f
parent c329c5dea1
commit 0f11e93e7f
2 changed files with 31 additions and 23 deletions
--- a/data_utils/data.py
+++ b/data_utils/data.py
@ -27,8 +27,10 @@ from data_utils.featurizer.speech_featurizer import SpeechFeaturizer
 from data_utils.speech import SpeechSegment
 from data_utils.normalizer import FeatureNormalizer

+__all__ = ['DataGenerator']

-class DataGenerator(object):
+
+class DataGenerator():
    """
    DataGenerator provides basic audio data preprocessing pipeline, and offers
    data reader interfaces of PaddlePaddle requirements.
@ -310,43 +312,32 @@ class DataGenerator(object):
                raise ValueError("If padding_to is not -1, it should be larger "
                                 "than any instance's shape in the batch")
            max_length = padding_to
+        max_text_length = max([len(text) for audio, text in batch])
        # padding
        padded_audios = []
-        texts, text_lens = [], []
        audio_lens = []
-        masks = []
+        texts, text_lens = [], []
        for audio, text in batch:
            padded_audio = np.zeros([audio.shape[0], max_length])
            padded_audio[:, :audio.shape[1]] = audio
            if flatten:
                padded_audio = padded_audio.flatten()
            padded_audios.append(padded_audio)
+            audio_lens.append(audio.shape[1])
            if self._is_training:
-                texts += text
+                padded_text = np.zeros([max_text_length])
+                padded_text[:len(text)] = text
+                texts.append(padded_text)
            else:
                texts.append(text)
            text_lens.append(len(text))
-            audio_lens.append(audio.shape[1])
-            mask_shape0 = (audio.shape[0] - 1) // 2 + 1
-            mask_shape1 = (audio.shape[1] - 1) // 3 + 1
-            mask_max_len = (max_length - 1) // 3 + 1
-            mask_ones = np.ones((mask_shape0, mask_shape1))
-            mask_zeros = np.zeros((mask_shape0, mask_max_len - mask_shape1))
-            mask = np.repeat(
-                np.reshape(
-                    np.concatenate((mask_ones, mask_zeros), axis=1),
-                    (1, mask_shape0, mask_max_len)),
-                32,
-                axis=0)
-            masks.append(mask)
+
        padded_audios = np.array(padded_audios).astype('float32')
+        audio_lens = np.array(audio_lens).astype('int64')
        if self._is_training:
-            texts = np.expand_dims(np.array(texts).astype('int32'), axis=-1)
-            texts = fluid.create_lod_tensor(
-                texts, recursive_seq_lens=[text_lens], place=self._place)
-        audio_lens = np.array(audio_lens).astype('int64').reshape([-1, 1])
-        masks = np.array(masks).astype('float32')
-        return padded_audios, texts, audio_lens, masks
+            texts = np.array(texts).astype('int32')
+            text_lens = np.array(text_lens).astype('int64')
+        return padded_audios, texts, audio_lens, text_lens

    def _batch_shuffle(self, manifest, batch_size, clipped=False):
        """Put similarly-sized instances into minibatches for better efficiency
--- a/data_utils/dataset.py
+++ b/data_utils/dataset.py
@ -0,0 +1,17 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle.io import Dataset
+from paddle.io import DataLoader