From b648f0c2d1161b8b520316bf137a3fd9d79b2eb1 Mon Sep 17 00:00:00 2001
From: wanghaoshuang
Date: Thu, 10 Aug 2017 11:52:25 +0800
Subject: [PATCH] Implement uploading data in submit scripts and fix issues

---
 cloud/README.md                               | 15 +----
 cloud/pcloud_submit.sh                        | 55 +++++++++++++++++--
 cloud/pcloud_train.sh                         | 26 ++++-----
 ...pcloud_prepare_data.py => prepare_data.py} |  4 +-
 cloud/{pcloud_split_data.py => split_data.py} |  6 +-
 pcloud_train.sh                               | 26 ++++-----
 6 files changed, 80 insertions(+), 52 deletions(-)
 rename cloud/{pcloud_prepare_data.py => prepare_data.py} (95%)
 rename cloud/{pcloud_split_data.py => split_data.py} (92%)

diff --git a/cloud/README.md b/cloud/README.md
index e7855ba8..7c23e0dc 100644
--- a/cloud/README.md
+++ b/cloud/README.md
@@ -21,21 +21,8 @@ The we can get job name 'deepspeech20170727130129' at last line
 ```
 $ paddlecloud logs -n 10000 deepspeech20170727130129
-$ ==========================deepspeech20170727130129-trainer-6vk3m==========================
-label selector: paddle-job-pserver=deepspeech20170727130129, desired: 1
-running pod list: [('Running', '10.1.3.6')]
-label selector: paddle-job=deepspeech20170727130129, desired: 1
-running pod list: [('Running', '10.1.83.14')]
-Starting training job: /pfs/dlnel/home/****@baidu.com/jobs/deepspeech20170727130129, num_gradient_servers: 1, trainer_id: 0, version: v2
-I0727 05:01:42.969719 25 Util.cpp:166] commandline: --num_gradient_servers=1 --ports_num_for_sparse=1 --use_gpu=1 --trainer_id=0 --pservers=10.1.3.6 --trainer_count=4 --num_passes=1 --ports_num=1 --port=7164
-[INFO 2017-07-27 05:01:50,279 layers.py:2430] output for __conv_0__: c = 32, h = 81, w = 54, size = 139968
-[WARNING 2017-07-27 05:01:50,280 layers.py:2789] brelu is not recommend for batch normalization's activation, maybe the relu is better
-[INFO 2017-07-27 05:01:50,283 layers.py:2430] output for __conv_1__: c = 32, h = 41, w = 54, size = 70848
-I0727 05:01:50.316176 25 MultiGradientMachine.cpp:99] numLogicalDevices=1 numThreads=4 numDevices=4
-I0727 05:01:50.454787 25 GradientMachine.cpp:85] Initing parameters..
-I0727 05:01:50.690007 25 GradientMachine.cpp:92] Init parameters done.
 ```
-[More optins and cmd aoubt paddle cloud](https://github.com/PaddlePaddle/cloud/blob/develop/doc/usage_cn.md)
+[More options and commands for PaddleCloud](https://github.com/PaddlePaddle/cloud/blob/develop/doc/usage_cn.md)
 
 ## Run DS2 by customize data
 TODO
 
diff --git a/cloud/pcloud_submit.sh b/cloud/pcloud_submit.sh
index 5d053501..9ea5d931 100644
--- a/cloud/pcloud_submit.sh
+++ b/cloud/pcloud_submit.sh
@@ -1,9 +1,54 @@
-DS2_PATH=../
-tar -czf deepspeech.tar.gz ${DS2_PATH}
+# Local data paths and PaddleCloud target directories (edit before submitting).
+TRAIN_MANIFEST="/home/work/wanghaoshuang/ds2/pcloud/models/deep_speech_2/datasets/manifest.dev"
+TEST_MANIFEST="/home/work/wanghaoshuang/ds2/pcloud/models/deep_speech_2/datasets/manifest.dev"
+VOCAB_PATH="/home/work/wanghaoshuang/ds2/pcloud/models/deep_speech_2/datasets/vocab/eng_vocab.txt"
+MEAN_STD_PATH="/home/work/wanghaoshuang/ds2/pcloud/models/deep_speech_2/compute_mean_std.py"
+CLOUD_DATA_DIR="/pfs/dlnel/home/wanghaoshuang@baidu.com/deepspeech2/data"
+CLOUD_MODEL_DIR="/pfs/dlnel/home/wanghaoshuang@baidu.com/deepspeech2/model"
+
+DS2_PATH=${PWD%/*}
+
+rm -rf ./tmp
+mkdir ./tmp
+
+paddlecloud ls ${CLOUD_DATA_DIR}/mean_std.npz
+if [ $? -ne 0 ];then
+    cp -f ${MEAN_STD_PATH} ./tmp/mean_std.npz
+    paddlecloud file put ./tmp/mean_std.npz ${CLOUD_DATA_DIR}/
+fi
+
+paddlecloud ls ${CLOUD_DATA_DIR}/vocab.txt
+if [ $? -ne 0 ];then
+    cp -f ${VOCAB_PATH} ./tmp/vocab.txt
+    paddlecloud file put ./tmp/vocab.txt ${CLOUD_DATA_DIR}/
+fi
+
+paddlecloud ls ${CLOUD_DATA_DIR}/cloud.train.manifest
+if [ $? -ne 0 ];then
+    python prepare_data.py \
+    --manifest_path=${TRAIN_MANIFEST} \
+    --out_tar_path="./tmp/cloud.train.tar" \
+    --out_manifest_path="./tmp/cloud.train.manifest"
+    paddlecloud file put ./tmp/cloud.train.tar ${CLOUD_DATA_DIR}/
+    paddlecloud file put ./tmp/cloud.train.manifest ${CLOUD_DATA_DIR}/
+fi
+
+paddlecloud ls ${CLOUD_DATA_DIR}/cloud.test.manifest
+if [ $? -ne 0 ];then
+    python prepare_data.py \
+    --manifest_path=${TEST_MANIFEST} \
+    --out_tar_path="./tmp/cloud.test.tar" \
+    --out_manifest_path="./tmp/cloud.test.manifest"
+    paddlecloud file put ./tmp/cloud.test.tar ${CLOUD_DATA_DIR}/
+    paddlecloud file put ./tmp/cloud.test.manifest ${CLOUD_DATA_DIR}/
+fi
+
+rm -rf ./tmp
+
 JOB_NAME=deepspeech`date +%Y%m%d%H%M%S`
 cp pcloud_train.sh ${DS2_PATH}
 paddlecloud submit \
--image wanghaoshuang/pcloud_ds2:latest-gpu-cudnn \
+-image bootstrapper:5000/wanghaoshuang/pcloud_ds2:latest-gpu-cudnn \
 -jobname ${JOB_NAME} \
 -cpu 4 \
 -gpu 4 \
@@ -13,5 +58,5 @@ paddlecloud submit \
 -pservers 1 \
 -psmemory 10Gi \
 -passes 1 \
--entry "sh pcloud_train.sh" \
-.
+-entry "sh pcloud_train.sh ${CLOUD_DATA_DIR} ${CLOUD_MODEL_DIR}" \
+${DS2_PATH}
diff --git a/cloud/pcloud_train.sh b/cloud/pcloud_train.sh
index 385281ce..ebf73bbb 100644
--- a/cloud/pcloud_train.sh
+++ b/cloud/pcloud_train.sh
@@ -1,37 +1,35 @@
-DATA_PATH=/pfs/dlnel/public/dataset/speech/libri
+DATA_PATH=$1
+MODEL_PATH=$2
 #setted by user
-TRAIN_MANI=${DATA_PATH}/manifest_pcloud.train
+TRAIN_MANI=${DATA_PATH}/cloud.train.manifest
 #setted by user
-DEV_MANI=${DATA_PATH}/manifest_pcloud.dev
+DEV_MANI=${DATA_PATH}/cloud.test.manifest
 #setted by user
-TRAIN_TAR=${DATA_PATH}/data.train.tar
+TRAIN_TAR=${DATA_PATH}/cloud.train.tar
 #setted by user
-DEV_TAR=${DATA_PATH}/data.dev.tar
+DEV_TAR=${DATA_PATH}/cloud.test.tar
 #setted by user
 VOCAB_PATH=${DATA_PATH}/eng_vocab.txt
 #setted by user
 MEAN_STD_FILE=${DATA_PATH}/mean_std.npz
-tar -xzf deepspeech.tar.gz
-rm -rf ./cloud/data/*
-
 # split train data for each pcloud node
-python ./cloud/pcloud_split_data.py \
+python ./cloud/split_data.py \
 --in_manifest_path=$TRAIN_MANI \
 --data_tar_path=$TRAIN_TAR \
---out_manifest_path='./cloud/data/train.mani'
+--out_manifest_path='./local.train.manifest'
 
 # split dev data for each pcloud node
-python pcloud_split_data.py \
+python ./cloud/split_data.py \
 --in_manifest_path=$DEV_MANI \
 --data_tar_path=$DEV_TAR \
---out_manifest_path='./cloud/data/dev.mani'
+--out_manifest_path='./local.test.manifest'
 
 python train.py \
 --use_gpu=1 \
 --trainer_count=4 \
 --batch_size=256 \
 --mean_std_filepath=$MEAN_STD_FILE \
---train_manifest_path='./cloud/data/train.mani' \
---dev_manifest_path='./cloud/data/dev.mani' \
+--train_manifest_path='./local.train.manifest' \
+--dev_manifest_path='./local.test.manifest' \
 --vocab_filepath=$VOCAB_PATH \
diff --git a/cloud/pcloud_prepare_data.py b/cloud/prepare_data.py
similarity index 95%
rename from cloud/pcloud_prepare_data.py
rename to cloud/prepare_data.py
index 2ffdaf63..dc1e2d27 100644
--- a/cloud/pcloud_prepare_data.py
+++ b/cloud/prepare_data.py
@@ -25,12 +25,12 @@ parser.add_argument(
     help="Manifest of target data. (default: %(default)s)")
 parser.add_argument(
     "--out_tar_path",
-    default="./data/dev.tar",
+    default="./tmp/cloud.train.tar",
     type=str,
     help="Output tar file path. (default: %(default)s)")
(default: %(default)s)") parser.add_argument( "--out_manifest_path", - default="./data/dev.mani", + default="./tmp/cloud.train.manifest", type=str, help="Manifest of output data. (default: %(default)s)") args = parser.parse_args() diff --git a/cloud/pcloud_split_data.py b/cloud/split_data.py similarity index 92% rename from cloud/pcloud_split_data.py rename to cloud/split_data.py index 8f98799a..78bf3174 100644 --- a/cloud/pcloud_split_data.py +++ b/cloud/split_data.py @@ -11,17 +11,17 @@ import argparse parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( "--in_manifest_path", - default='./cloud/data/dev.mani', + default='./cloud.train.manifest', type=str, help="Input manifest path. (default: %(default)s)") parser.add_argument( "--data_tar_path", - default='./cloud/data/dev.tar', + default='./cloud.train.tar', type=str, help="Data tar file path. (default: %(default)s)") parser.add_argument( "--out_manifest_path", - default='./cloud/data/dev.mani.split', + default='./local.train.manifest', type=str, help="Out manifest file path. (default: %(default)s)") args = parser.parse_args() diff --git a/pcloud_train.sh b/pcloud_train.sh index b13e23e9..ebf73bbb 100644 --- a/pcloud_train.sh +++ b/pcloud_train.sh @@ -1,37 +1,35 @@ -DATA_PATH=/pfs/dlnel/public/dataset/speech/libri +DATA_PATH=$1 +MODEL_PATH=$2 #setted by user -TRAIN_MANI=${DATA_PATH}/manifest_pcloud.train +TRAIN_MANI=${DATA_PATH}/cloud.train.manifest #setted by user -DEV_MANI=${DATA_PATH}/manifest_pcloud.dev +DEV_MANI=${DATA_PATH}/cloud.test.manifest #setted by user -TRAIN_TAR=${DATA_PATH}/data.train.tar +TRAIN_TAR=${DATA_PATH}/cloud.train.tar #setted by user -DEV_TAR=${DATA_PATH}/data.dev.tar +DEV_TAR=${DATA_PATH}/cloud.test.tar #setted by user VOCAB_PATH=${DATA_PATH}/eng_vocab.txt #setted by user MEAN_STD_FILE=${DATA_PATH}/mean_std.npz -tar -xzvf deepspeech.tar.gz -rm -rf ./cloud/data/* - # split train data for each pcloud node -python ./cloud/pcloud_split_data.py \ +python ./cloud/split_data.py \ --in_manifest_path=$TRAIN_MANI \ --data_tar_path=$TRAIN_TAR \ ---out_manifest_path='./cloud/data/train.mani' +--out_manifest_path='./local.train.manifest' # split dev data for each pcloud node -python pcloud_split_data.py \ +python ./cloud/split_data.py \ --in_manifest_path=$DEV_MANI \ --data_tar_path=$DEV_TAR \ ---out_manifest_path='./cloud/data/dev.mani' +--out_manifest_path='./local.test.manifest' python train.py \ --use_gpu=1 \ --trainer_count=4 \ --batch_size=256 \ --mean_std_filepath=$MEAN_STD_FILE \ ---train_manifest_path='./cloud/data/train.mani' \ ---dev_manifest_path='./cloud/data/dev.mani' \ +--train_manifest_path='./local.train.manifest' \ +--dev_manifest_path='./local.test.manifest' \ --vocab_filepath=$VOCAB_PATH \