# data_utils/data.py -- tar-aware sound-file reading for DataGenerator.
# NOTE(review): reconstructed from a collapsed git diff; only the logical
# units fully visible in the patch are reproduced here.
import tarfile
from threading import local

# Per-thread cache of opened tar archives.  TarFile objects are not safe to
# share across threads, so each reader thread keeps its own open handle
# (tar2object) and an index of member name -> TarInfo (tar2info), ensuring an
# archive is scanned at most once per thread.
local_data = local()
local_data.tar2info = {}
local_data.tar2object = {}


class DataGenerator(object):
    """(Partial reconstruction) tar-aware sound-file reading methods of
    DataGenerator; the full class lives in data_utils/data.py."""

    def _parse_tar(self, file):
        """Open a tar archive and index its members.

        :param file: Path to the tar archive.
        :return: Tuple of (open TarFile object, dict mapping member name
                 to its TarInfo entry).
        """
        members = {}
        # Renamed from `object`/`infoes`: don't shadow the builtin.
        tar_file = tarfile.open(file)
        for tarinfo in tar_file.getmembers():
            members[tarinfo.name] = tarinfo
        return tar_file, members

    def _read_soundbytes(self, filepath):
        """Return the raw bytes of an audio file.

        ``filepath`` is either a plain path or the form
        ``tar:/path/to/archive.tar#member``, in which case the member is
        extracted from the thread-locally cached tar archive.

        :param filepath: Plain path or ``tar:<tarpath>#<member>`` spec.
        :return: File contents as bytes.
        """
        if filepath.startswith('tar:'):
            tarpath, filename = filepath.split(':', 1)[1].split('#', 1)
            # Threads spawned after module import start with an empty
            # thread-local namespace, so (re)create the caches lazily.
            if 'tar2info' not in local_data.__dict__:
                local_data.tar2info = {}
            if 'tar2object' not in local_data.__dict__:
                local_data.tar2object = {}
            if tarpath not in local_data.tar2info:
                tar_file, infos = self._parse_tar(tarpath)
                local_data.tar2info[tarpath] = infos
                local_data.tar2object[tarpath] = tar_file
            return local_data.tar2object[tarpath].extractfile(
                local_data.tar2info[tarpath][filename]).read()
        # Fix: open in binary mode -- audio data is bytes, and the original
        # text-mode open() both risked corrupting the payload and leaked the
        # file handle.
        with open(filepath, 'rb') as f:
            return f.read()

    def _process_utterance(self, filename, transcript):
        """Load, augment, featurize and normalize one utterance.

        :param filename: Audio path (plain or ``tar:...#...`` spec).
        :param transcript: Transcription text of the utterance.
        :return: Tuple of (normalized spectrogram, token id list).
        """
        speech_segment = SpeechSegment.from_bytes(
            self._read_soundbytes(filename), transcript)
        self._augmentation_pipeline.transform_audio(speech_segment)
        specgram, text_ids = self._speech_featurizer.featurize(speech_segment)
        specgram = self._normalizer.apply(specgram)
        return specgram, text_ids
00000000..91400114 --- /dev/null +++ b/datasets/librispeech/pcloud_data.py @@ -0,0 +1,51 @@ +import json +import os +import tarfile +import sys +import argparse + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--manifest_path", + default="/manifest.train", + type=str, + help="Manifest of target data. (default: %(default)s)") +parser.add_argument( + "--out_tar_path", + default="/dev.tar", + type=str, + help="Output tar file path. (default: %(default)s)") +parser.add_argument( + "--out_manifest_path", + default="/dev.mani", + type=str, + help="Manifest of output data. (default: %(default)s)") +args = parser.parse_args() + + +def gen_pcloud_data(manifest_path, out_tar_path, out_manifest_path): + ''' + 1. According manifest, tar sound files into out_tar_path + 2. Generate a new manifest for output tar file + ''' + out_tar = tarfile.open(out_tar_path, 'w') + manifest = [] + for json_line in open(manifest_path): + try: + json_data = json.loads(json_line) + except Exception as e: + raise IOError("Error reading manifest: %s" % str(e)) + sound_file = json_data['audio_filepath'] + filename = os.path.basename(sound_file) + out_tar.add(sound_file, arcname=filename) + json_data['audio_filepath'] = filename + manifest.append("%s\n" % json.dumps(json_data)) + with open(out_manifest_path, 'w') as out_manifest: + out_manifest.writelines(manifest) + out_manifest.close() + out_tar.close() + + +if __name__ == '__main__': + gen_pcloud_data(args.manifest_path, args.out_tar_path, + args.out_manifest_path) diff --git a/pcloud_split_data.py b/pcloud_split_data.py new file mode 100644 index 00000000..bf35383a --- /dev/null +++ b/pcloud_split_data.py @@ -0,0 +1,47 @@ +import os +import json +import argparse + + +def split_data(inManifest, tar_path, outManifest): + trainer_id = 1 + trainer_count = 2 + #with open("/trainer_id", "r") as f: + # trainer_id = int(f.readline()[:-1]) + #with open("/trainer_count", "r") as f: + # trainer_count = int(f.readline()[:-1]) 
+ + tarPath = os.path.abspath(tar_path) + result = [] + for index, json_line in enumerate(open(inManifest)): + if (index % trainer_count) == trainer_id: + json_data = json.loads(json_line) + json_data['audio_filepath'] = "tar:%s#%s" % ( + tarPath, json_data['audio_filepath']) + result.append("%s\n" % json.dumps(json_data)) + with open(outManifest, 'w') as manifest: + manifest.writelines(result) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description=__doc__) + + parser.add_argument( + "--in_manifest_path", + default='datasets/dev.mani', + type=str, + help="Input manifest path. (default: %(default)s)") + parser.add_argument( + "--data_tar_path", + default='datasets/dev.tar', + type=str, + help="Data tar file path. (default: %(default)s)") + parser.add_argument( + "--out_manifest_path", + default='datasets/dev.mani.split', + type=str, + help="Out manifest file path. (default: %(default)s)") + args = parser.parse_args() + + split_data(args.in_manifest_path, args.data_tar_path, + args.out_manifest_path) diff --git a/pcloud_submit.sh b/pcloud_submit.sh new file mode 100644 index 00000000..06e65110 --- /dev/null +++ b/pcloud_submit.sh @@ -0,0 +1,13 @@ +paddlecloud submit \ +-image wanghaoshuang/pcloud_ds2 \ +-jobname ds23 \ +-cpu 1 \ +-gpu 0 \ +-memory 10Gi \ +-parallelism 1 \ +-pscpu 1 \ +-pservers 1 \ +-psmemory 10Gi \ +-passes 1 \ +-entry "sh pcloud_train.sh" \ +./deep_speech_2 diff --git a/pcloud_train.sh b/pcloud_train.sh new file mode 100644 index 00000000..fb6cbb9e --- /dev/null +++ b/pcloud_train.sh @@ -0,0 +1,32 @@ +#setted by user +TRAIN_MANI='/pfs/dlnel/home/yanxu05@baidu.com/wanghaoshuang/data/ds2_data/demo.mani' +#setted by user +DEV_MANI='/pfs/dlnel/home/yanxu05@baidu.com/wanghaoshuang/data/ds2_data/demo.mani' +#setted by user +TRAIN_TAR='/pfs/dlnel/home/yanxu05@baidu.com/wanghaoshuang/data/ds2_data/demo.tar' +#setted by user +DEV_TAR='/pfs/dlnel/home/yanxu05@baidu.com/wanghaoshuang/data/ds2_data/demo.tar' +#setted by user 
+VOCAB_PATH='/pfs/dlnel/home/yanxu05@baidu.com/wanghaoshuang/data/ds2_data/eng_vocab.txt' +#setted by user +MEAN_STD_FILE='/pfs/dlnel/home/yanxu05@baidu.com/wanghaoshuang/data/ds2_data/mean_std.npz' + +# split train data for each pcloud node +python pcloud_split_data.py \ +--in_manifest_path=$TRAIN_MANI \ +--data_tar_path=$TRAIN_TAR \ +--out_manifest_path='./train.mani' +# split dev data for each pcloud node +python pcloud_split_data.py \ +--in_manifest_path=$DEV_MANI \ +--data_tar_path=$DEV_TAR \ +--out_manifest_path='./dev.mani' + +python train.py \ +--use_gpu=0 \ +--trainer_count=4 \ +--batch_size=2 \ +--mean_std_filepath=$MEAN_STD_FILE \ +--train_manifest_path='./train.mani' \ +--dev_manifest_path='./dev.mani' \ +--vocab_filepath=$VOCAB_PATH \