Separate data uploading from job submission for DS2 cloud training and add support for uploading data in multiple shards.

pull/2/head
Xinghai Sun 7 years ago
parent af71fccee5
commit f75746cd31

@@ -1,32 +1,11 @@
-# Configure input data set in local filesystem
-TRAIN_MANIFEST="../datasets/manifest.train"
-DEV_MANIFEST="../datasets/manifest.dev"
-VOCAB_FILE="../datasets/vocab/eng_vocab.txt"
-MEAN_STD_FILE="../mean_std.npz"
-# Configure output path in PaddleCloud filesystem
-CLOUD_DATA_DIR="/pfs/dlnel/home/sunxinghai@baidu.com/deepspeech2/data"
+TRAIN_MANIFEST="cloud/cloud.manifest.train"
+DEV_MANIFEST="cloud/cloud.manifest.dev"
 CLOUD_MODEL_DIR="/pfs/dlnel/home/sunxinghai@baidu.com/deepspeech2/model"
-# Configure cloud resources
-NUM_CPU=8
+BATCH_SIZE=256
 NUM_GPU=8
 NUM_NODE=1
-MEMORY="10Gi"
 IS_LOCAL="True"
 
-# Pack and upload local data to PaddleCloud filesystem
-python upload_data.py \
---train_manifest_path=${TRAIN_MANIFEST} \
---dev_manifest_path=${DEV_MANIFEST} \
---vocab_file=${VOCAB_FILE} \
---mean_std_file=${MEAN_STD_FILE} \
---cloud_data_path=${CLOUD_DATA_DIR}
-if [ $? -ne 0 ]
-then
-    echo "upload data failed!"
-    exit 1
-fi
-
-# Submit job to PaddleCloud
 JOB_NAME=deepspeech-`date +%Y%m%d%H%M%S`
 DS2_PATH=${PWD%/*}
 cp -f pcloud_train.sh ${DS2_PATH}
@@ -34,15 +13,15 @@ cp -f pcloud_train.sh ${DS2_PATH}
 paddlecloud submit \
 -image bootstrapper:5000/wanghaoshuang/pcloud_ds2:latest \
 -jobname ${JOB_NAME} \
--cpu ${NUM_CPU} \
+-cpu ${NUM_GPU} \
 -gpu ${NUM_GPU} \
--memory ${MEMORY} \
+-memory 10Gi \
 -parallelism ${NUM_NODE} \
 -pscpu 1 \
 -pservers 1 \
--psmemory ${MEMORY} \
+-psmemory 10Gi \
 -passes 1 \
--entry "sh pcloud_train.sh ${CLOUD_DATA_DIR} ${CLOUD_MODEL_DIR} ${NUM_CPU} ${NUM_GPU} ${IS_LOCAL}" \
+-entry "sh pcloud_train.sh ${TRAIN_MANIFEST} ${DEV_MANIFEST} ${CLOUD_MODEL_DIR} ${NUM_GPU} ${BATCH_SIZE} ${IS_LOCAL}" \
 ${DS2_PATH}
 rm ${DS2_PATH}/pcloud_train.sh
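Note: with the values configured above, the -entry command that each node runs expands to the following (an illustration only, substituting TRAIN_MANIFEST, DEV_MANIFEST, CLOUD_MODEL_DIR, NUM_GPU, BATCH_SIZE and IS_LOCAL):

# Illustrative expansion of the -entry argument with the defaults above
sh pcloud_train.sh cloud/cloud.manifest.train cloud/cloud.manifest.dev \
    /pfs/dlnel/home/sunxinghai@baidu.com/deepspeech2/model 8 256 True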

@@ -1,36 +1,24 @@
-DATA_PATH=$1
-MODEL_PATH=$2
-NUM_CPU=$3
+TRAIN_MANIFEST=$1
+DEV_MANIFEST=$2
+MODEL_PATH=$3
 NUM_GPU=$4
-IS_LOCAL=$5
-
-TRAIN_MANI=${DATA_PATH}/cloud.train.manifest
-DEV_MANI=${DATA_PATH}/cloud.dev.manifest
-TRAIN_TAR=${DATA_PATH}/cloud.train.tar
-DEV_TAR=${DATA_PATH}/cloud.dev.tar
-VOCAB_PATH=${DATA_PATH}/vocab.txt
-MEAN_STD_FILE=${DATA_PATH}/mean_std.npz
+BATCH_SIZE=$5
+IS_LOCAL=$6
 
-# split train data for each pcloud node
 python ./cloud/split_data.py \
---in_manifest_path=${TRAIN_MANI} \
---data_tar_path=${TRAIN_TAR} \
---out_manifest_path='/local.train.manifest'
+--in_manifest_path=${TRAIN_MANIFEST} \
+--out_manifest_path='/local.manifest.train'
 
-# split dev data for each pcloud node
 python ./cloud/split_data.py \
---in_manifest_path=${DEV_MANI} \
---data_tar_path=${DEV_TAR} \
---out_manifest_path='/local.dev.manifest'
+--in_manifest_path=${DEV_MANIFEST} \
+--out_manifest_path='/local.manifest.dev'
 
-# run train
 python train.py \
+--batch_size=$BATCH_SIZE \
 --use_gpu=1 \
 --trainer_count=${NUM_GPU} \
---num_threads_data=${NUM_CPU} \
+--num_threads_data=${NUM_GPU} \
 --is_local=${IS_LOCAL} \
---mean_std_filepath=${MEAN_STD_FILE} \
---train_manifest_path='/local.train.manifest' \
---dev_manifest_path='/local.dev.manifest' \
---vocab_filepath=${VOCAB_PATH} \
+--train_manifest_path='/local.manifest.train' \
+--dev_manifest_path='/local.manifest.dev' \
 --output_model_dir=${MODEL_PATH}
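Note: train.py now consumes the per-node manifests written by split_data.py, and every audio_filepath in them already points inside an uploaded tar shard, so the node never untars the data itself. A quick sanity check on a node (the record shown is invented, to illustrate the tar:<shard path>#<file name> scheme produced by upload_data.py):

# Expect one JSON record per line, with a tar-prefixed audio path, e.g.
# {"audio_filepath": "tar:/pfs/dlnel/.../data/part-00003-of-00010.tar#utt-0001.wav", ...}
head -n 1 /local.manifest.train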

@@ -0,0 +1,17 @@
+IN_MANIFESTS="../datasets/manifest.train ../datasets/manifest.dev ../datasets/manifest.test"
+OUT_MANIFESTS="./cloud.manifest.train ./cloud.manifest.dev ./cloud.manifest.test"
+CLOUD_DATA_DIR="/pfs/dlnel/home/sunxinghai@baidu.com/deepspeech2/data"
+NUM_SHARDS=10
+
+python upload_data.py \
+--in_manifest_paths ${IN_MANIFESTS} \
+--out_manifest_paths ${OUT_MANIFESTS} \
+--cloud_data_dir ${CLOUD_DATA_DIR} \
+--num_shards ${NUM_SHARDS}
+
+if [ $? -ne 0 ]
+then
+    echo "Upload Data Failed!"
+    exit 1
+fi
+echo "All Done."

@@ -1,7 +1,5 @@
 """This tool is used for splitting data into each node of
-paddle cloud by total trainer count and current trainer id.
-The meaning of trainer is a instance of k8s cluster.
-This script should be called in paddle cloud.
+paddlecloud. This script should be called in paddlecloud.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -14,40 +12,30 @@ import argparse
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
     "--in_manifest_path",
-    default='./cloud.train.manifest',
     type=str,
-    help="Input manifest path. (default: %(default)s)")
-parser.add_argument(
-    "--data_tar_path",
-    default='./cloud.train.tar',
-    type=str,
-    help="Data tar file path. (default: %(default)s)")
+    required=True,
+    help="Input manifest path for all nodes.")
 parser.add_argument(
     "--out_manifest_path",
-    default='./local.train.manifest',
     type=str,
-    help="Out manifest file path. (default: %(default)s)")
+    required=True,
+    help="Output manifest file path for current node.")
 args = parser.parse_args()
 
 
-def split_data(in_manifest, tar_path, out_manifest):
+def split_data(in_manifest_path, out_manifest_path):
     with open("/trainer_id", "r") as f:
         trainer_id = int(f.readline()[:-1])
     with open("/trainer_count", "r") as f:
         trainer_count = int(f.readline()[:-1])
 
-    tar_path = os.path.abspath(tar_path)
-    result = []
-    for index, json_line in enumerate(open(in_manifest)):
+    out_manifest = []
+    for index, json_line in enumerate(open(in_manifest_path, 'r')):
         if (index % trainer_count) == trainer_id:
-            json_data = json.loads(json_line)
-            json_data['audio_filepath'] = "tar:%s#%s" % (
-                tar_path, json_data['audio_filepath'])
-            result.append("%s\n" % json.dumps(json_data))
-    with open(out_manifest, 'w') as manifest:
-        manifest.writelines(result)
+            out_manifest.append("%s\n" % json_line.strip())
+    with open(out_manifest_path, 'w') as f:
+        f.writelines(out_manifest)
 
 
 if __name__ == '__main__':
-    split_data(args.in_manifest_path, args.data_tar_path,
-               args.out_manifest_path)
+    split_data(args.in_manifest_path, args.out_manifest_path)
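Note: the split is a plain round-robin over manifest lines: the node with id trainer_id keeps every line whose zero-based index equals trainer_id modulo trainer_count. A minimal sketch of the same selection in awk, assuming trainer_id=1 and trainer_count=4 (on paddlecloud these values are read from /trainer_id and /trainer_count):

# Node 1 of 4 keeps manifest lines 2, 6, 10, ...
awk -v id=1 -v count=4 '(NR - 1) % count == id' cloud.manifest.train > /local.manifest.train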

@@ -1,12 +1,9 @@
-"""This script is used for preparing data for DeepSpeech2 trainning on paddle
-cloud.
+"""This script is for uploading data for DeepSpeech2 training on paddlecloud.
 Steps:
-1. Read original manifest and get the local path of sound files.
-2. Tar all local sound files into one tar file.
-3. Modify original manifest to remove the local path information.
-Finally, we will get a tar file and a new manifest.
+1. Read original manifests and extract local sound files.
+2. Tar all local sound files into multiple tar files and upload them.
+3. Modify original manifests with updated paths in cloud filesystem.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -22,66 +19,81 @@ from subprocess import call
 import _init_paths
 from data_utils.utils import read_manifest
 
-TRAIN_TAR = "cloud.train.tar"
-TRAIN_MANIFEST = "cloud.train.manifest"
-DEV_TAR = "cloud.dev.tar"
-DEV_MANIFEST = "cloud.dev.manifest"
-VOCAB_FILE = "vocab.txt"
-MEAN_STD_FILE = "mean_std.npz"
-
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
-    "--train_manifest_path",
-    default="../datasets/manifest.train",
-    type=str,
-    help="Manifest file path for train data. (default: %(default)s)")
-parser.add_argument(
-    "--dev_manifest_path",
-    default="../datasets/manifest.dev",
-    type=str,
-    help="Manifest file path for validation data. (default: %(default)s)")
-parser.add_argument(
-    "--vocab_file",
-    default="../datasets/vocab/eng_vocab.txt",
+    "--in_manifest_paths",
+    default=["../datasets/manifest.test", "../datasets/manifest.dev"],
     type=str,
-    help="Vocabulary file to be uploaded to paddlecloud. "
+    nargs='+',
+    help="Local filepaths of input manifests to load, pack and upload. "
     "(default: %(default)s)")
 parser.add_argument(
-    "--mean_std_file",
-    default="../mean_std.npz",
+    "--out_manifest_paths",
+    default=["./cloud.manifest.test", "./cloud.manifest.dev"],
     type=str,
-    help="Normalizer's statistics (mean and stddev) file to be uploaded to "
-    "paddlecloud. (default: %(default)s)")
+    nargs='+',
+    help="Local filepaths of modified manifests to write to. "
+    "(default: %(default)s)")
 parser.add_argument(
-    "--cloud_data_path",
+    "--cloud_data_dir",
     required=True,
     type=str,
-    help="Destination path on paddlecloud. (default: %(default)s)")
+    help="Destination directory on paddlecloud to upload data to.")
+parser.add_argument(
+    "--num_shards",
+    default=10,
+    type=int,
+    help="Number of parts to split data to. (default: %(default)s)")
 parser.add_argument(
-    "--local_tmp_path",
+    "--local_tmp_dir",
     default="./tmp/",
     type=str,
     help="Local directory for storing temporary data. (default: %(default)s)")
 args = parser.parse_args()
 
 
-def pack_data(manifest_path, out_tar_path, out_manifest_path):
-    """1. According to the manifest, tar sound files into out_tar_path.
-    2. Generate a new manifest for output tar file.
+def upload_data(in_manifest_path_list, out_manifest_path_list, local_tmp_dir,
+                upload_tar_dir, num_shards):
+    """Extract and pack sound files listed in the manifest files into multiple
+    tar files and upload them to paddlecloud. Besides, generate new manifest
+    files with updated paths in paddlecloud.
     """
-    out_tar = tarfile.open(out_tar_path, 'w')
-    manifest = read_manifest(manifest_path)
-    results = []
-    for json_data in manifest:
-        sound_file = json_data['audio_filepath']
-        filename = os.path.basename(sound_file)
-        out_tar.add(sound_file, arcname=filename)
-        json_data['audio_filepath'] = filename
-        results.append("%s\n" % json.dumps(json_data))
-    with open(out_manifest_path, 'w') as out_manifest:
-        out_manifest.writelines(results)
-    out_manifest.close()
-    out_tar.close()
+    # compute total audio number
+    total_line = 0
+    for manifest_path in in_manifest_path_list:
+        with open(manifest_path, 'r') as f:
+            total_line += len(f.readlines())
+    line_per_tar = (total_line // num_shards) + 1
+
+    # pack and upload shard by shard
+    line_count, tar_file = 0, None
+    for manifest_path, out_manifest_path in zip(in_manifest_path_list,
+                                                out_manifest_path_list):
+        manifest = read_manifest(manifest_path)
+        out_manifest = []
+
+        for json_data in manifest:
+            sound_filepath = json_data['audio_filepath']
+            sound_filename = os.path.basename(sound_filepath)
+            if line_count % line_per_tar == 0:
+                if tar_file is not None:
+                    tar_file.close()
+                    pcloud_cp(tar_path, upload_tar_dir)
+                    os.remove(tar_path)
+                tar_name = 'part-%s-of-%s.tar' % (
+                    str(line_count // line_per_tar).zfill(5),
+                    str(num_shards).zfill(5))
+                tar_path = os.path.join(local_tmp_dir, tar_name)
+                tar_file = tarfile.open(tar_path, 'w')
+            tar_file.add(sound_filepath, arcname=sound_filename)
+            line_count += 1
+            json_data['audio_filepath'] = "tar:%s#%s" % (
+                os.path.join(upload_tar_dir, tar_name), sound_filename)
+            out_manifest.append("%s\n" % json.dumps(json_data))
+
+        with open(out_manifest_path, 'w') as f:
+            f.writelines(out_manifest)
+
+    tar_file.close()
+    pcloud_cp(tar_path, upload_tar_dir)
+    os.remove(tar_path)
 
 
 def pcloud_mkdir(dir):
@@ -99,44 +111,12 @@ def pcloud_cp(src, dst):
         raise IOError("PaddleCloud cp failed: from [%s] to [%s]." % (src, dst))
 
 
-def pcloud_exist(path):
-    """Check if file or directory exists in PaddleCloud filesystem.
-    """
-    ret = call(['paddlecloud', 'ls', path])
-    return ret
-
-
 if __name__ == '__main__':
-    cloud_train_manifest = os.path.join(args.cloud_data_path, TRAIN_MANIFEST)
-    cloud_train_tar = os.path.join(args.cloud_data_path, TRAIN_TAR)
-    cloud_dev_manifest = os.path.join(args.cloud_data_path, DEV_MANIFEST)
-    cloud_dev_tar = os.path.join(args.cloud_data_path, DEV_TAR)
-    cloud_vocab_file = os.path.join(args.cloud_data_path, VOCAB_FILE)
-    cloud_mean_file = os.path.join(args.cloud_data_path, MEAN_STD_FILE)
-
-    local_train_manifest = os.path.join(args.local_tmp_path, TRAIN_MANIFEST)
-    local_train_tar = os.path.join(args.local_tmp_path, TRAIN_TAR)
-    local_dev_manifest = os.path.join(args.local_tmp_path, DEV_MANIFEST)
-    local_dev_tar = os.path.join(args.local_tmp_path, DEV_TAR)
-
-    # prepare local and cloud dir
-    if os.path.exists(args.local_tmp_path):
-        shutil.rmtree(args.local_tmp_path)
-    os.makedirs(args.local_tmp_path)
-    pcloud_mkdir(args.cloud_data_path)
-
-    # pack and upload train data
-    pack_data(args.train_manifest_path, local_train_tar, local_train_manifest)
-    pcloud_cp(local_train_manifest, cloud_train_manifest)
-    pcloud_cp(local_train_tar, cloud_train_tar)
-
-    # pack and upload validation data
-    pack_data(args.dev_manifest_path, local_dev_tar, local_dev_manifest)
-    pcloud_cp(local_dev_manifest, cloud_dev_manifest)
-    pcloud_cp(local_dev_tar, cloud_dev_tar)
-
-    # upload vocab file and mean_std file
-    pcloud_cp(args.vocab_file, cloud_vocab_file)
-    pcloud_cp(args.mean_std_file, cloud_mean_file)
-
-    shutil.rmtree(args.local_tmp_path)
+    if not os.path.exists(args.local_tmp_dir):
+        os.makedirs(args.local_tmp_dir)
+    pcloud_mkdir(args.cloud_data_dir)
+
+    upload_data(args.in_manifest_paths, args.out_manifest_paths,
+                args.local_tmp_dir, args.cloud_data_dir, args.num_shards)
+
+    shutil.rmtree(args.local_tmp_dir)
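Note: shards are named part-XXXXX-of-YYYYY.tar with both numbers zero-padded to five digits, and each manifest entry's audio_filepath is rewritten to tar:<cloud tar path>#<sound file name> so the reader can locate each utterance inside its shard. A hedged usage example with a larger shard count (flags as defined by the argparse options above; manifest paths assumed):

# Pack and upload train + dev data in 50 shards instead of the default 10
python upload_data.py \
    --in_manifest_paths ../datasets/manifest.train ../datasets/manifest.dev \
    --out_manifest_paths ./cloud.manifest.train ./cloud.manifest.dev \
    --cloud_data_dir /pfs/dlnel/home/sunxinghai@baidu.com/deepspeech2/data \
    --num_shards 50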
