1. Refine data_utils/data.py to read bytes from tar file
2. Add scripts to submit PaddleCloud job for DS2 training

pull/2/head
parent db37c34919
commit 3c77d369ca
@@ -0,0 +1,51 @@
"""Pack the sound files listed in a manifest into a tar file and
generate a new manifest whose entries point at the tar members."""
import json
import os
import tarfile
import argparse

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--manifest_path",
    default="/manifest.train",
    type=str,
    help="Manifest of target data. (default: %(default)s)")
parser.add_argument(
    "--out_tar_path",
    default="/dev.tar",
    type=str,
    help="Output tar file path. (default: %(default)s)")
parser.add_argument(
    "--out_manifest_path",
    default="/dev.mani",
    type=str,
    help="Manifest of output data. (default: %(default)s)")
args = parser.parse_args()


def gen_pcloud_data(manifest_path, out_tar_path, out_manifest_path):
    """
    1. According to the manifest, tar the sound files into out_tar_path.
    2. Generate a new manifest for the output tar file.
    """
    out_tar = tarfile.open(out_tar_path, 'w')
    manifest = []
    for json_line in open(manifest_path):
        try:
            json_data = json.loads(json_line)
        except Exception as e:
            raise IOError("Error reading manifest: %s" % str(e))
        sound_file = json_data['audio_filepath']
        filename = os.path.basename(sound_file)
        # Store each sound file in the tar under its basename.
        out_tar.add(sound_file, arcname=filename)
        # The new manifest entry refers to the tar member by basename only.
        json_data['audio_filepath'] = filename
        manifest.append("%s\n" % json.dumps(json_data))
    with open(out_manifest_path, 'w') as out_manifest:
        out_manifest.writelines(manifest)
    out_tar.close()


if __name__ == '__main__':
    gen_pcloud_data(args.manifest_path, args.out_tar_path,
                    args.out_manifest_path)
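For orientation, here is a minimal sketch (not part of the commit) of the rewrite that gen_pcloud_data applies to a single manifest entry; the example path, duration and transcript below are invented for illustration only.

import json
import os

# A hypothetical manifest line before packing:
line = '{"audio_filepath": "/data/dev-clean/1272-0000.flac", "duration": 5.8, "text": "hello"}'
entry = json.loads(line)
# The sound file is added to the tar under its basename ...
arcname = os.path.basename(entry['audio_filepath'])
# ... and the rewritten manifest entry keeps only that basename.
entry['audio_filepath'] = arcname
print(json.dumps(entry))
# {"audio_filepath": "1272-0000.flac", "duration": 5.8, "text": "hello"}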
@@ -0,0 +1,47 @@
"""Split a manifest into per-node shards for PaddleCloud training and
rewrite each audio path to point into the shared data tar file."""
import os
import json
import argparse


def split_data(in_manifest, tar_path, out_manifest_path):
    # NOTE: trainer_id and trainer_count are hard-coded for now; the
    # commented-out lines show how they would be read on a cloud node.
    trainer_id = 1
    trainer_count = 2
    #with open("/trainer_id", "r") as f:
    #    trainer_id = int(f.readline()[:-1])
    #with open("/trainer_count", "r") as f:
    #    trainer_count = int(f.readline()[:-1])

    tar_abspath = os.path.abspath(tar_path)
    result = []
    for index, json_line in enumerate(open(in_manifest)):
        # Keep only the manifest lines assigned to this trainer.
        if (index % trainer_count) == trainer_id:
            json_data = json.loads(json_line)
            # Rewrite the path as "tar:<abs_tar_path>#<member_name>".
            json_data['audio_filepath'] = "tar:%s#%s" % (
                tar_abspath, json_data['audio_filepath'])
            result.append("%s\n" % json.dumps(json_data))
    with open(out_manifest_path, 'w') as manifest:
        manifest.writelines(result)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=__doc__)

    parser.add_argument(
        "--in_manifest_path",
        default='datasets/dev.mani',
        type=str,
        help="Input manifest path. (default: %(default)s)")
    parser.add_argument(
        "--data_tar_path",
        default='datasets/dev.tar',
        type=str,
        help="Data tar file path. (default: %(default)s)")
    parser.add_argument(
        "--out_manifest_path",
        default='datasets/dev.mani.split',
        type=str,
        help="Output manifest file path. (default: %(default)s)")
    args = parser.parse_args()

    split_data(args.in_manifest_path, args.data_tar_path,
               args.out_manifest_path)
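The commit title mentions refining data_utils/data.py to read bytes from the tar file, but that change is not shown in this diff. The sketch below, with a hypothetical helper name that is not the actual data_utils API, only illustrates one way the "tar:<abs_tar_path>#<member>" references written by split_data could be resolved back into raw audio bytes.

import tarfile

def read_bytes_from_tar(audio_filepath):
    # Expects the "tar:<abs_tar_path>#<member_name>" form written by split_data.
    assert audio_filepath.startswith("tar:")
    tar_path, member_name = audio_filepath[len("tar:"):].split("#", 1)
    with tarfile.open(tar_path, 'r') as tar:
        return tar.extractfile(member_name).read()

# Example with a hypothetical path:
# read_bytes_from_tar("tar:/pfs/mydata/dev.tar#1272-0000.flac")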
@@ -0,0 +1,13 @@
paddlecloud submit \
-image wanghaoshuang/pcloud_ds2 \
-jobname ds23 \
-cpu 1 \
-gpu 0 \
-memory 10Gi \
-parallelism 1 \
-pscpu 1 \
-pservers 1 \
-psmemory 10Gi \
-passes 1 \
-entry "sh pcloud_train.sh" \
./deep_speech_2
@@ -0,0 +1,32 @@
# set by user
TRAIN_MANI='/pfs/dlnel/home/yanxu05@baidu.com/wanghaoshuang/data/ds2_data/demo.mani'
# set by user
DEV_MANI='/pfs/dlnel/home/yanxu05@baidu.com/wanghaoshuang/data/ds2_data/demo.mani'
# set by user
TRAIN_TAR='/pfs/dlnel/home/yanxu05@baidu.com/wanghaoshuang/data/ds2_data/demo.tar'
# set by user
DEV_TAR='/pfs/dlnel/home/yanxu05@baidu.com/wanghaoshuang/data/ds2_data/demo.tar'
# set by user
VOCAB_PATH='/pfs/dlnel/home/yanxu05@baidu.com/wanghaoshuang/data/ds2_data/eng_vocab.txt'
# set by user
MEAN_STD_FILE='/pfs/dlnel/home/yanxu05@baidu.com/wanghaoshuang/data/ds2_data/mean_std.npz'

# split train data for each pcloud node
python pcloud_split_data.py \
--in_manifest_path=$TRAIN_MANI \
--data_tar_path=$TRAIN_TAR \
--out_manifest_path='./train.mani'
# split dev data for each pcloud node
python pcloud_split_data.py \
--in_manifest_path=$DEV_MANI \
--data_tar_path=$DEV_TAR \
--out_manifest_path='./dev.mani'

python train.py \
--use_gpu=0 \
--trainer_count=4 \
--batch_size=2 \
--mean_std_filepath=$MEAN_STD_FILE \
--train_manifest_path='./train.mani' \
--dev_manifest_path='./dev.mani' \
--vocab_filepath=$VOCAB_PATH \