1. Refine data_utils/data.py to read bytes from tar file 2. Add scripts to submit paddle cloud job for ds2 training (pull/2/head)
parent
db37c34919
commit
3c77d369ca
@ -0,0 +1,51 @@
|
|||||||
|
"""Tar the audio files listed in a manifest for a paddlecloud job.

Reads a manifest (one JSON object per line), packs the referenced sound
files into a tar archive, and writes a new manifest pointing at the
archive members.
"""
import json
import os
import tarfile
import sys
import argparse

# The module docstring above doubles as the CLI description; without it
# `description=__doc__` was None and `--help` showed no summary.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--manifest_path",
    default="/manifest.train",
    type=str,
    help="Manifest of target data. (default: %(default)s)")
parser.add_argument(
    "--out_tar_path",
    default="/dev.tar",
    type=str,
    help="Output tar file path. (default: %(default)s)")
parser.add_argument(
    "--out_manifest_path",
    default="/dev.mani",
    type=str,
    help="Manifest of output data. (default: %(default)s)")
args = parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def gen_pcloud_data(manifest_path, out_tar_path, out_manifest_path):
    """Package the sound files listed in a manifest into a tar archive.

    1. According to the manifest, tar sound files into out_tar_path.
    2. Generate a new manifest (one JSON object per line) whose
       'audio_filepath' fields are the archive member names.

    :param manifest_path: Path of the input manifest file.
    :param out_tar_path: Path of the tar archive to create.
    :param out_manifest_path: Path of the rewritten manifest to write.
    :raises IOError: If a manifest line is not valid JSON.
    """
    manifest = []
    # 'with' guarantees both the tar archive and the input manifest are
    # closed even if packing fails part-way through.
    with tarfile.open(out_tar_path, 'w') as out_tar, \
            open(manifest_path) as manifest_file:
        for json_line in manifest_file:
            try:
                json_data = json.loads(json_line)
            except ValueError as e:
                # json decoding errors are ValueErrors; anything else
                # (e.g. KeyboardInterrupt) should propagate unchanged.
                raise IOError("Error reading manifest: %s" % str(e))
            sound_file = json_data['audio_filepath']
            filename = os.path.basename(sound_file)
            out_tar.add(sound_file, arcname=filename)
            # Rewrite the path so readers look the file up inside the tar.
            json_data['audio_filepath'] = filename
            manifest.append("%s\n" % json.dumps(json_data))
    with open(out_manifest_path, 'w') as out_manifest:
        out_manifest.writelines(manifest)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Script entry point: pack the data described by the CLI arguments.
    gen_pcloud_data(
        args.manifest_path, args.out_tar_path, args.out_manifest_path)
|
@ -0,0 +1,47 @@
|
|||||||
|
import os
|
||||||
|
import json
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
|
||||||
|
def split_data(inManifest, tar_path, outManifest, trainer_id=1,
               trainer_count=2):
    """Shard a manifest for one paddlecloud trainer node.

    Keeps every line whose index satisfies
    ``index % trainer_count == trainer_id`` and rewrites each
    'audio_filepath' into ``tar:<abs_tar_path>#<member>`` form, so the
    data reader can fetch the audio bytes from inside the tar file.

    :param inManifest: Path of the full input manifest.
    :param tar_path: Path of the tar file holding the audio data.
    :param outManifest: Path of the sharded manifest to write.
    :param trainer_id: Index of this trainer node (default 1).
    :param trainer_count: Total number of trainer nodes (default 2).
    """
    # On a real paddlecloud node the id/count would come from the
    # environment instead of the defaults, e.g.:
    #     with open("/trainer_id", "r") as f:
    #         trainer_id = int(f.readline()[:-1])
    #     with open("/trainer_count", "r") as f:
    #         trainer_count = int(f.readline()[:-1])

    tar_abs_path = os.path.abspath(tar_path)
    result = []
    # 'with' ensures the input manifest handle is closed.
    with open(inManifest) as manifest_file:
        for index, json_line in enumerate(manifest_file):
            if (index % trainer_count) == trainer_id:
                json_data = json.loads(json_line)
                json_data['audio_filepath'] = "tar:%s#%s" % (
                    tar_abs_path, json_data['audio_filepath'])
                result.append("%s\n" % json.dumps(json_data))
    with open(outManifest, 'w') as manifest:
        manifest.writelines(result)
||||||
|
|
||||||
|
|
||||||
|
def _parse_args():
    """Build and run the CLI parser for the manifest-splitting script."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--in_manifest_path",
        type=str,
        default='datasets/dev.mani',
        help="Input manifest path. (default: %(default)s)")
    parser.add_argument(
        "--data_tar_path",
        type=str,
        default='datasets/dev.tar',
        help="Data tar file path. (default: %(default)s)")
    parser.add_argument(
        "--out_manifest_path",
        type=str,
        default='datasets/dev.mani.split',
        help="Out manifest file path. (default: %(default)s)")
    return parser.parse_args()


if __name__ == '__main__':
    cli_args = _parse_args()
    split_data(cli_args.in_manifest_path, cli_args.data_tar_path,
               cli_args.out_manifest_path)
|
@ -0,0 +1,13 @@
|
|||||||
|
# Submit the DeepSpeech2 training job to paddlecloud, running
# pcloud_train.sh inside the given docker image.
# Flag meanings below are per the paddlecloud CLI — confirm with
# `paddlecloud submit -h`:
#   -image                      docker image providing the runtime
#   -jobname                    unique name of the cloud job
#   -cpu / -gpu / -memory       resources requested per trainer
#   -parallelism                number of trainer nodes
#   -pscpu / -pservers / -psmemory  parameter-server resources
#   -passes                     number of training passes
# The trailing path is the local job package directory to upload.
paddlecloud submit \
-image wanghaoshuang/pcloud_ds2 \
-jobname ds23 \
-cpu 1 \
-gpu 0 \
-memory 10Gi \
-parallelism 1 \
-pscpu 1 \
-pservers 1 \
-psmemory 10Gi \
-passes 1 \
-entry "sh pcloud_train.sh" \
./deep_speech_2
|
@ -0,0 +1,32 @@
|
|||||||
|
# Data locations on the paddlecloud shared filesystem (PFS).
# All six paths below must be edited by the user for their own account.
# Set by user: manifest of the training data.
TRAIN_MANI='/pfs/dlnel/home/yanxu05@baidu.com/wanghaoshuang/data/ds2_data/demo.mani'
# Set by user: manifest of the dev (validation) data.
DEV_MANI='/pfs/dlnel/home/yanxu05@baidu.com/wanghaoshuang/data/ds2_data/demo.mani'
# Set by user: tar archive holding the training audio files.
TRAIN_TAR='/pfs/dlnel/home/yanxu05@baidu.com/wanghaoshuang/data/ds2_data/demo.tar'
# Set by user: tar archive holding the dev audio files.
DEV_TAR='/pfs/dlnel/home/yanxu05@baidu.com/wanghaoshuang/data/ds2_data/demo.tar'
# Set by user: vocabulary file.
VOCAB_PATH='/pfs/dlnel/home/yanxu05@baidu.com/wanghaoshuang/data/ds2_data/eng_vocab.txt'
# Set by user: precomputed feature mean/std file.
MEAN_STD_FILE='/pfs/dlnel/home/yanxu05@baidu.com/wanghaoshuang/data/ds2_data/mean_std.npz'

# split train data for each pcloud node
python pcloud_split_data.py \
--in_manifest_path=$TRAIN_MANI \
--data_tar_path=$TRAIN_TAR \
--out_manifest_path='./train.mani'
# split dev data for each pcloud node
python pcloud_split_data.py \
--in_manifest_path=$DEV_MANI \
--data_tar_path=$DEV_TAR \
--out_manifest_path='./dev.mani'
|
||||||
|
|
||||||
|
python train.py \
|
||||||
|
--use_gpu=0 \
|
||||||
|
--trainer_count=4 \
|
||||||
|
--batch_size=2 \
|
||||||
|
--mean_std_filepath=$MEAN_STD_FILE \
|
||||||
|
--train_manifest_path='./train.mani' \
|
||||||
|
--dev_manifest_path='./dev.mani' \
|
||||||
|
--vocab_filepath=$VOCAB_PATH \
|
Loading…
Reference in new issue