Separate data uploading from job submission for DS2 cloud training and add support for uploading data in multiple shards.

pull/2/head
Xinghai Sun 7 years ago
parent af71fccee5
commit f75746cd31

@@ -1,32 +1,11 @@
-# Configure input data set in local filesystem
-TRAIN_MANIFEST="../datasets/manifest.train"
-DEV_MANIFEST="../datasets/manifest.dev"
-VOCAB_FILE="../datasets/vocab/eng_vocab.txt"
-MEAN_STD_FILE="../mean_std.npz"
-# Configure output path in PaddleCloud filesystem
-CLOUD_DATA_DIR="/pfs/dlnel/home/sunxinghai@baidu.com/deepspeech2/data"
+TRAIN_MANIFEST="cloud/cloud.manifest.train"
+DEV_MANIFEST="cloud/cloud.manifest.dev"
 CLOUD_MODEL_DIR="/pfs/dlnel/home/sunxinghai@baidu.com/deepspeech2/model"
-# Configure cloud resources
-NUM_CPU=8
+BATCH_SIZE=256
 NUM_GPU=8
 NUM_NODE=1
-MEMORY="10Gi"
 IS_LOCAL="True"
 
-# Pack and upload local data to PaddleCloud filesystem
-python upload_data.py \
---train_manifest_path=${TRAIN_MANIFEST} \
---dev_manifest_path=${DEV_MANIFEST} \
---vocab_file=${VOCAB_FILE} \
---mean_std_file=${MEAN_STD_FILE} \
---cloud_data_path=${CLOUD_DATA_DIR}
-if [ $? -ne 0 ]
-then
-    echo "upload data failed!"
-    exit 1
-fi
-
-# Submit job to PaddleCloud
 JOB_NAME=deepspeech-`date +%Y%m%d%H%M%S`
 DS2_PATH=${PWD%/*}
 cp -f pcloud_train.sh ${DS2_PATH}
@@ -34,15 +13,15 @@ cp -f pcloud_train.sh ${DS2_PATH}
 paddlecloud submit \
 -image bootstrapper:5000/wanghaoshuang/pcloud_ds2:latest \
 -jobname ${JOB_NAME} \
--cpu ${NUM_CPU} \
+-cpu ${NUM_GPU} \
 -gpu ${NUM_GPU} \
--memory ${MEMORY} \
+-memory 10Gi \
 -parallelism ${NUM_NODE} \
 -pscpu 1 \
 -pservers 1 \
--psmemory ${MEMORY} \
+-psmemory 10Gi \
 -passes 1 \
--entry "sh pcloud_train.sh ${CLOUD_DATA_DIR} ${CLOUD_MODEL_DIR} ${NUM_CPU} ${NUM_GPU} ${IS_LOCAL}" \
+-entry "sh pcloud_train.sh ${TRAIN_MANIFEST} ${DEV_MANIFEST} ${CLOUD_MODEL_DIR} ${NUM_GPU} ${BATCH_SIZE} ${IS_LOCAL}" \
 ${DS2_PATH}
 rm ${DS2_PATH}/pcloud_train.sh
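Note: with the values configured above, the -entry command that each node runs expands to the following (an illustration only, substituting TRAIN_MANIFEST, DEV_MANIFEST, CLOUD_MODEL_DIR, NUM_GPU, BATCH_SIZE and IS_LOCAL):

# Illustrative expansion of the -entry argument with the defaults above
sh pcloud_train.sh cloud/cloud.manifest.train cloud/cloud.manifest.dev \
    /pfs/dlnel/home/sunxinghai@baidu.com/deepspeech2/model 8 256 True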

@@ -1,36 +1,24 @@
-DATA_PATH=$1
-MODEL_PATH=$2
-NUM_CPU=$3
+TRAIN_MANIFEST=$1
+DEV_MANIFEST=$2
+MODEL_PATH=$3
 NUM_GPU=$4
-IS_LOCAL=$5
-
-TRAIN_MANI=${DATA_PATH}/cloud.train.manifest
-DEV_MANI=${DATA_PATH}/cloud.dev.manifest
-TRAIN_TAR=${DATA_PATH}/cloud.train.tar
-DEV_TAR=${DATA_PATH}/cloud.dev.tar
-VOCAB_PATH=${DATA_PATH}/vocab.txt
-MEAN_STD_FILE=${DATA_PATH}/mean_std.npz
+BATCH_SIZE=$5
+IS_LOCAL=$6
 
-# split train data for each pcloud node
 python ./cloud/split_data.py \
---in_manifest_path=${TRAIN_MANI} \
---data_tar_path=${TRAIN_TAR} \
---out_manifest_path='/local.train.manifest'
+--in_manifest_path=${TRAIN_MANIFEST} \
+--out_manifest_path='/local.manifest.train'
 
-# split dev data for each pcloud node
 python ./cloud/split_data.py \
---in_manifest_path=${DEV_MANI} \
---data_tar_path=${DEV_TAR} \
---out_manifest_path='/local.dev.manifest'
+--in_manifest_path=${DEV_MANIFEST} \
+--out_manifest_path='/local.manifest.dev'
 
-# run train
 python train.py \
+--batch_size=$BATCH_SIZE \
 --use_gpu=1 \
 --trainer_count=${NUM_GPU} \
---num_threads_data=${NUM_CPU} \
+--num_threads_data=${NUM_GPU} \
 --is_local=${IS_LOCAL} \
---mean_std_filepath=${MEAN_STD_FILE} \
---train_manifest_path='/local.train.manifest' \
---dev_manifest_path='/local.dev.manifest' \
---vocab_filepath=${VOCAB_PATH} \
+--train_manifest_path='/local.manifest.train' \
+--dev_manifest_path='/local.manifest.dev' \
 --output_model_dir=${MODEL_PATH}
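Note: train.py now consumes the per-node manifests written by split_data.py, and every audio_filepath in them already points inside an uploaded tar shard, so the node never untars the data itself. A quick sanity check on a node (the record shown is invented, to illustrate the tar:<shard path>#<file name> scheme produced by upload_data.py):

# Expect one JSON record per line, with a tar-prefixed audio path, e.g.
# {"audio_filepath": "tar:/pfs/dlnel/.../data/part-00003-of-00010.tar#utt-0001.wav", ...}
head -n 1 /local.manifest.train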

@@ -0,0 +1,17 @@
+IN_MANIFESTS="../datasets/manifest.train ../datasets/manifest.dev ../datasets/manifest.test"
+OUT_MANIFESTS="./cloud.manifest.train ./cloud.manifest.dev ./cloud.manifest.test"
+CLOUD_DATA_DIR="/pfs/dlnel/home/sunxinghai@baidu.com/deepspeech2/data"
+NUM_SHARDS=10
+
+python upload_data.py \
+--in_manifest_paths ${IN_MANIFESTS} \
+--out_manifest_paths ${OUT_MANIFESTS} \
+--cloud_data_dir ${CLOUD_DATA_DIR} \
+--num_shards ${NUM_SHARDS}
+
+if [ $? -ne 0 ]
+then
+    echo "Upload Data Failed!"
+    exit 1
+fi
+echo "All Done."

@@ -1,7 +1,5 @@
 """This tool is used for splitting data into each node of
-paddle cloud by total trainer count and current trainer id.
-The meaning of trainer is a instance of k8s cluster.
-This script should be called in paddle cloud.
+paddlecloud. This script should be called in paddlecloud.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -14,40 +12,30 @@ import argparse
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
     "--in_manifest_path",
-    default='./cloud.train.manifest',
     type=str,
-    help="Input manifest path. (default: %(default)s)")
-parser.add_argument(
-    "--data_tar_path",
-    default='./cloud.train.tar',
-    type=str,
-    help="Data tar file path. (default: %(default)s)")
+    required=True,
+    help="Input manifest path for all nodes.")
 parser.add_argument(
     "--out_manifest_path",
-    default='./local.train.manifest',
     type=str,
-    help="Out manifest file path. (default: %(default)s)")
+    required=True,
+    help="Output manifest file path for current node.")
 args = parser.parse_args()
 
 
-def split_data(in_manifest, tar_path, out_manifest):
+def split_data(in_manifest_path, out_manifest_path):
     with open("/trainer_id", "r") as f:
         trainer_id = int(f.readline()[:-1])
     with open("/trainer_count", "r") as f:
         trainer_count = int(f.readline()[:-1])
 
-    tar_path = os.path.abspath(tar_path)
-    result = []
-    for index, json_line in enumerate(open(in_manifest)):
+    out_manifest = []
+    for index, json_line in enumerate(open(in_manifest_path, 'r')):
         if (index % trainer_count) == trainer_id:
-            json_data = json.loads(json_line)
-            json_data['audio_filepath'] = "tar:%s#%s" % (
-                tar_path, json_data['audio_filepath'])
-            result.append("%s\n" % json.dumps(json_data))
-    with open(out_manifest, 'w') as manifest:
-        manifest.writelines(result)
+            out_manifest.append("%s\n" % json_line.strip())
+    with open(out_manifest_path, 'w') as f:
+        f.writelines(out_manifest)
 
 
 if __name__ == '__main__':
-    split_data(args.in_manifest_path, args.data_tar_path,
-               args.out_manifest_path)
+    split_data(args.in_manifest_path, args.out_manifest_path)
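Note: the split is a plain round-robin over manifest lines: the node with id trainer_id keeps every line whose zero-based index equals trainer_id modulo trainer_count. A minimal sketch of the same selection in awk, assuming trainer_id=1 and trainer_count=4 (on paddlecloud these values are read from /trainer_id and /trainer_count):

# Node 1 of 4 keeps manifest lines 2, 6, 10, ...
awk -v id=1 -v count=4 '(NR - 1) % count == id' cloud.manifest.train > /local.manifest.train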

@@ -1,12 +1,9 @@
-"""This script is used for preparing data for DeepSpeech2 trainning on paddle
-cloud.
+"""This script is for uploading data for DeepSpeech2 training on paddlecloud.
 Steps:
-1. Read original manifest and get the local path of sound files.
-2. Tar all local sound files into one tar file.
-3. Modify original manifest to remove the local path information.
-Finally, we will get a tar file and a new manifest.
+1. Read original manifests and extract local sound files.
+2. Tar all local sound files into multiple tar files and upload them.
+3. Modify original manifests with updated paths in cloud filesystem.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -22,66 +19,81 @@ from subprocess import call
 import _init_paths
 from data_utils.utils import read_manifest
 
-TRAIN_TAR = "cloud.train.tar"
-TRAIN_MANIFEST = "cloud.train.manifest"
-DEV_TAR = "cloud.dev.tar"
-DEV_MANIFEST = "cloud.dev.manifest"
-VOCAB_FILE = "vocab.txt"
-MEAN_STD_FILE = "mean_std.npz"
-
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
-    "--train_manifest_path",
-    default="../datasets/manifest.train",
-    type=str,
-    help="Manifest file path for train data. (default: %(default)s)")
-parser.add_argument(
-    "--dev_manifest_path",
-    default="../datasets/manifest.dev",
-    type=str,
-    help="Manifest file path for validation data. (default: %(default)s)")
-parser.add_argument(
-    "--vocab_file",
-    default="../datasets/vocab/eng_vocab.txt",
+    "--in_manifest_paths",
+    default=["../datasets/manifest.test", "../datasets/manifest.dev"],
     type=str,
-    help="Vocabulary file to be uploaded to paddlecloud. "
+    nargs='+',
+    help="Local filepaths of input manifests to load, pack and upload. "
     "(default: %(default)s)")
 parser.add_argument(
-    "--mean_std_file",
-    default="../mean_std.npz",
+    "--out_manifest_paths",
+    default=["./cloud.manifest.test", "./cloud.manifest.dev"],
     type=str,
-    help="Normalizer's statistics (mean and stddev) file to be uploaded to "
-    "paddlecloud. (default: %(default)s)")
+    nargs='+',
+    help="Local filepaths of modified manifests to write to. "
+    "(default: %(default)s)")
 parser.add_argument(
-    "--cloud_data_path",
+    "--cloud_data_dir",
     required=True,
     type=str,
-    help="Destination path on paddlecloud. (default: %(default)s)")
+    help="Destination directory on paddlecloud to upload data to.")
+parser.add_argument(
+    "--num_shards",
+    default=10,
+    type=int,
+    help="Number of parts to split data to. (default: %(default)s)")
 parser.add_argument(
-    "--local_tmp_path",
+    "--local_tmp_dir",
     default="./tmp/",
     type=str,
     help="Local directory for storing temporary data. (default: %(default)s)")
 args = parser.parse_args()
 
 
-def pack_data(manifest_path, out_tar_path, out_manifest_path):
-    """1. According to the manifest, tar sound files into out_tar_path.
-    2. Generate a new manifest for output tar file.
+def upload_data(in_manifest_path_list, out_manifest_path_list, local_tmp_dir,
+                upload_tar_dir, num_shards):
+    """Extract and pack sound files listed in the manifest files into multiple
+    tar files and upload them to paddlecloud. Besides, generate new manifest
+    files with updated paths in paddlecloud.
     """
-    out_tar = tarfile.open(out_tar_path, 'w')
-    manifest = read_manifest(manifest_path)
-    results = []
-    for json_data in manifest:
-        sound_file = json_data['audio_filepath']
-        filename = os.path.basename(sound_file)
-        out_tar.add(sound_file, arcname=filename)
-        json_data['audio_filepath'] = filename
-        results.append("%s\n" % json.dumps(json_data))
-    with open(out_manifest_path, 'w') as out_manifest:
-        out_manifest.writelines(results)
-    out_manifest.close()
-    out_tar.close()
+    # compute total audio number
+    total_line = 0
+    for manifest_path in in_manifest_path_list:
+        with open(manifest_path, 'r') as f:
+            total_line += len(f.readlines())
+    line_per_tar = (total_line // num_shards) + 1
+
+    # pack and upload shard by shard
+    line_count, tar_file = 0, None
+    for manifest_path, out_manifest_path in zip(in_manifest_path_list,
+                                                out_manifest_path_list):
+        manifest = read_manifest(manifest_path)
+        out_manifest = []
+
+        for json_data in manifest:
+            sound_filepath = json_data['audio_filepath']
+            sound_filename = os.path.basename(sound_filepath)
+            if line_count % line_per_tar == 0:
+                if tar_file is not None:
+                    tar_file.close()
+                    pcloud_cp(tar_path, upload_tar_dir)
+                    os.remove(tar_path)
+                tar_name = 'part-%s-of-%s.tar' % (
+                    str(line_count // line_per_tar).zfill(5),
+                    str(num_shards).zfill(5))
+                tar_path = os.path.join(local_tmp_dir, tar_name)
+                tar_file = tarfile.open(tar_path, 'w')
+            tar_file.add(sound_filepath, arcname=sound_filename)
+            line_count += 1
+            json_data['audio_filepath'] = "tar:%s#%s" % (
+                os.path.join(upload_tar_dir, tar_name), sound_filename)
+            out_manifest.append("%s\n" % json.dumps(json_data))
+
+        with open(out_manifest_path, 'w') as f:
+            f.writelines(out_manifest)
+
+    tar_file.close()
+    pcloud_cp(tar_path, upload_tar_dir)
+    os.remove(tar_path)
 
 
 def pcloud_mkdir(dir):
@@ -99,44 +111,12 @@ def pcloud_cp(src, dst):
         raise IOError("PaddleCloud cp failed: from [%s] to [%s]." % (src, dst))
 
 
-def pcloud_exist(path):
-    """Check if file or directory exists in PaddleCloud filesystem.
-    """
-    ret = call(['paddlecloud', 'ls', path])
-    return ret
-
-
 if __name__ == '__main__':
-    cloud_train_manifest = os.path.join(args.cloud_data_path, TRAIN_MANIFEST)
-    cloud_train_tar = os.path.join(args.cloud_data_path, TRAIN_TAR)
-    cloud_dev_manifest = os.path.join(args.cloud_data_path, DEV_MANIFEST)
-    cloud_dev_tar = os.path.join(args.cloud_data_path, DEV_TAR)
-    cloud_vocab_file = os.path.join(args.cloud_data_path, VOCAB_FILE)
-    cloud_mean_file = os.path.join(args.cloud_data_path, MEAN_STD_FILE)
-
-    local_train_manifest = os.path.join(args.local_tmp_path, TRAIN_MANIFEST)
-    local_train_tar = os.path.join(args.local_tmp_path, TRAIN_TAR)
-    local_dev_manifest = os.path.join(args.local_tmp_path, DEV_MANIFEST)
-    local_dev_tar = os.path.join(args.local_tmp_path, DEV_TAR)
-
-    # prepare local and cloud dir
-    if os.path.exists(args.local_tmp_path):
-        shutil.rmtree(args.local_tmp_path)
-    os.makedirs(args.local_tmp_path)
-    pcloud_mkdir(args.cloud_data_path)
-
-    # pack and upload train data
-    pack_data(args.train_manifest_path, local_train_tar, local_train_manifest)
-    pcloud_cp(local_train_manifest, cloud_train_manifest)
-    pcloud_cp(local_train_tar, cloud_train_tar)
-
-    # pack and upload validation data
-    pack_data(args.dev_manifest_path, local_dev_tar, local_dev_manifest)
-    pcloud_cp(local_dev_manifest, cloud_dev_manifest)
-    pcloud_cp(local_dev_tar, cloud_dev_tar)
-
-    # upload vocab file and mean_std file
-    pcloud_cp(args.vocab_file, cloud_vocab_file)
-    pcloud_cp(args.mean_std_file, cloud_mean_file)
-
-    shutil.rmtree(args.local_tmp_path)
+    if not os.path.exists(args.local_tmp_dir):
+        os.makedirs(args.local_tmp_dir)
+    pcloud_mkdir(args.cloud_data_dir)
+
+    upload_data(args.in_manifest_paths, args.out_manifest_paths,
+                args.local_tmp_dir, args.cloud_data_dir, args.num_shards)
+
+    shutil.rmtree(args.local_tmp_dir)
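Note: shards are named part-XXXXX-of-YYYYY.tar with both numbers zero-padded to five digits, and each manifest entry's audio_filepath is rewritten to tar:<cloud tar path>#<sound file name> so the reader can locate each utterance inside its shard. A hedged usage example with a larger shard count (flags as defined by the argparse options above; manifest paths assumed):

# Pack and upload train + dev data in 50 shards instead of the default 10
python upload_data.py \
    --in_manifest_paths ../datasets/manifest.train ../datasets/manifest.dev \
    --out_manifest_paths ./cloud.manifest.train ./cloud.manifest.dev \
    --cloud_data_dir /pfs/dlnel/home/sunxinghai@baidu.com/deepspeech2/data \
    --num_shards 50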
