parent bbe47a4318
commit 9fa9a352ac
@ -0,0 +1,45 @@
# DeepSpeech2 on PaddleCloud

## Run DS2 with public data

**Step 1:** Make sure the current directory is `models/deep_speech_2/cloud/`.

**Step 2:** Submit the job with `sh pcloud_submit.sh`:

```
$ sh pcloud_submit.sh
uploading: deepspeech.tar.gz...
uploading: pcloud_prepare_data.py...
uploading: pcloud_split_data.py...
uploading: pcloud_submit.sh...
uploading: pcloud_train.sh...
deepspeech20170727130129 submited.
```

The job name, here `deepspeech20170727130129`, is printed in the last line of the output.

**Step 3:** Fetch the logs from PaddleCloud with `paddlecloud logs -n 10000 deepspeech20170727130129`:

```
$ paddlecloud logs -n 10000 deepspeech20170727130129
==========================deepspeech20170727130129-trainer-6vk3m==========================
label selector: paddle-job-pserver=deepspeech20170727130129, desired: 1
running pod list: [('Running', '10.1.3.6')]
label selector: paddle-job=deepspeech20170727130129, desired: 1
running pod list: [('Running', '10.1.83.14')]
Starting training job: /pfs/dlnel/home/yanxu05@baidu.com/jobs/deepspeech20170727130129, num_gradient_servers: 1, trainer_id: 0, version: v2
I0727 05:01:42.969719 25 Util.cpp:166] commandline: --num_gradient_servers=1 --ports_num_for_sparse=1 --use_gpu=1 --trainer_id=0 --pservers=10.1.3.6 --trainer_count=4 --num_passes=1 --ports_num=1 --port=7164
[INFO 2017-07-27 05:01:50,279 layers.py:2430] output for __conv_0__: c = 32, h = 81, w = 54, size = 139968
[WARNING 2017-07-27 05:01:50,280 layers.py:2789] brelu is not recommend for batch normalization's activation, maybe the relu is better
[INFO 2017-07-27 05:01:50,283 layers.py:2430] output for __conv_1__: c = 32, h = 41, w = 54, size = 70848
[WARNING 2017-07-27 05:01:50,283 layers.py:2789] brelu is not recommend for batch normalization's activation, maybe the relu is better
[WARNING 2017-07-27 05:01:50,287 layers.py:2789] is not recommend for batch normalization's activation, maybe the relu is better
[WARNING 2017-07-27 05:01:50,291 layers.py:2789] is not recommend for batch normalization's activation, maybe the relu is better
[WARNING 2017-07-27 05:01:50,295 layers.py:2789] is not recommend for batch normalization's activation, maybe the relu is better
I0727 05:01:50.316176 25 MultiGradientMachine.cpp:99] numLogicalDevices=1 numThreads=4 numDevices=4
I0727 05:01:50.454787 25 GradientMachine.cpp:85] Initing parameters..
I0727 05:01:50.690007 25 GradientMachine.cpp:92] Init parameters done.
```

[More options and commands for PaddleCloud](https://github.com/PaddlePaddle/cloud/blob/develop/doc/usage_cn.md)

## Run DS2 with customized data

TODO
@ -0,0 +1,50 @@
"""
This tool splits data across the nodes of a PaddleCloud job, based on
the total trainer count and the current trainer id. A trainer is an
instance in the k8s cluster. This script should be run on PaddleCloud.
"""
import os
import json
import argparse

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--in_manifest_path",
    default='./cloud/data/dev.mani',
    type=str,
    help="Input manifest path. (default: %(default)s)")
parser.add_argument(
    "--data_tar_path",
    default='./cloud/data/dev.tar',
    type=str,
    help="Data tar file path. (default: %(default)s)")
parser.add_argument(
    "--out_manifest_path",
    default='./cloud/data/dev.mani.split',
    type=str,
    help="Output manifest file path. (default: %(default)s)")
args = parser.parse_args()


def split_data(in_manifest, tar_path, out_manifest):
    # PaddleCloud exposes the trainer id and the total trainer count
    # through these two files on every node.
    with open("/trainer_id", "r") as f:
        trainer_id = int(f.readline()[:-1])
    with open("/trainer_count", "r") as f:
        trainer_count = int(f.readline()[:-1])

    tar_path = os.path.abspath(tar_path)
    result = []
    # Round-robin split: this trainer keeps every trainer_count-th line,
    # rewriting each audio path to point into the tar archive.
    for index, json_line in enumerate(open(in_manifest)):
        if (index % trainer_count) == trainer_id:
            json_data = json.loads(json_line)
            json_data['audio_filepath'] = "tar:%s#%s" % (
                tar_path, json_data['audio_filepath'])
            result.append("%s\n" % json.dumps(json_data))
    with open(out_manifest, 'w') as manifest:
        manifest.writelines(result)


if __name__ == '__main__':
    split_data(args.in_manifest_path, args.data_tar_path,
               args.out_manifest_path)
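
To make the round-robin split concrete, here is a minimal, self-contained sketch of what one trainer does to a single manifest line. The manifest entry is hypothetical, but the `tar:PATH#NAME` rewrite matches the script above.

```
import json
import os

# Hypothetical manifest line; real manifests hold one JSON object per line.
json_line = '{"audio_filepath": "dev/1.wav", "text": "hello"}'
tar_path = os.path.abspath('./cloud/data/dev.tar')

trainer_id, trainer_count, index = 0, 2, 0
if (index % trainer_count) == trainer_id:  # line 0 belongs to trainer 0
    json_data = json.loads(json_line)
    json_data['audio_filepath'] = "tar:%s#%s" % (
        tar_path, json_data['audio_filepath'])
    print(json.dumps(json_data))
    # {"audio_filepath": "tar:/.../cloud/data/dev.tar#dev/1.wav", "text": "hello"}
```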
@ -0,0 +1,17 @@
# Package the DS2 sources and submit a single-trainer GPU job.
DS2_PATH=../
tar -czf deepspeech.tar.gz ${DS2_PATH}
# Derive a unique job name from the current timestamp.
JOB_NAME=deepspeech`date +%Y%m%d%H%M%S`
cp pcloud_train.sh ${DS2_PATH}
paddlecloud submit \
    -image wanghaoshuang/pcloud_ds2:latest-gpu-cudnn \
    -jobname ${JOB_NAME} \
    -cpu 4 \
    -gpu 4 \
    -memory 10Gi \
    -parallelism 1 \
    -pscpu 1 \
    -pservers 1 \
    -psmemory 10Gi \
    -passes 1 \
    -entry "sh pcloud_train.sh" \
    .
@ -0,0 +1,37 @@
DATA_PATH=/pfs/dlnel/public/dataset/speech/libri
# set by user
TRAIN_MANI=${DATA_PATH}/manifest_pcloud.train
# set by user
DEV_MANI=${DATA_PATH}/manifest_pcloud.dev
# set by user
TRAIN_TAR=${DATA_PATH}/data.train.tar
# set by user
DEV_TAR=${DATA_PATH}/data.dev.tar
# set by user
VOCAB_PATH=${DATA_PATH}/eng_vocab.txt
# set by user
MEAN_STD_FILE=${DATA_PATH}/mean_std.npz

tar -xzf deepspeech.tar.gz
rm -rf ./cloud/data/*

# split train data for each pcloud node
python ./cloud/pcloud_split_data.py \
    --in_manifest_path=$TRAIN_MANI \
    --data_tar_path=$TRAIN_TAR \
    --out_manifest_path='./cloud/data/train.mani'

# split dev data for each pcloud node
python ./cloud/pcloud_split_data.py \
    --in_manifest_path=$DEV_MANI \
    --data_tar_path=$DEV_TAR \
    --out_manifest_path='./cloud/data/dev.mani'

python train.py \
    --use_gpu=1 \
    --trainer_count=4 \
    --batch_size=256 \
    --mean_std_filepath=$MEAN_STD_FILE \
    --train_manifest_path='./cloud/data/train.mani' \
    --dev_manifest_path='./cloud/data/dev.mani' \
    --vocab_filepath=$VOCAB_PATH \
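
The split manifests produced above reference audio inside a tar archive via the `tar:PATH#NAME` scheme written by `pcloud_split_data.py`. How DS2's data layer resolves that scheme is not shown in this commit; purely as an illustration, a hypothetical resolver (the name `read_audio_bytes` and the fallback behavior are assumptions, not the actual DS2 API) could look like this:

```
import tarfile

def read_audio_bytes(audio_filepath):
    # Hypothetical reader for the 'tar:/abs/path.tar#member' scheme;
    # the real DS2 data reader may differ.
    if audio_filepath.startswith("tar:"):
        tar_path, member = audio_filepath[len("tar:"):].split("#", 1)
        with tarfile.open(tar_path) as tar:
            return tar.extractfile(member).read()
    # Plain paths are read directly.
    with open(audio_filepath, "rb") as f:
        return f.read()
```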
@ -1,47 +0,0 @@
import os
import json
import argparse


def split_data(inManifest, tar_path, outManifest):
    trainer_id = 1
    trainer_count = 2
    #with open("/trainer_id", "r") as f:
    #    trainer_id = int(f.readline()[:-1])
    #with open("/trainer_count", "r") as f:
    #    trainer_count = int(f.readline()[:-1])

    tarPath = os.path.abspath(tar_path)
    result = []
    for index, json_line in enumerate(open(inManifest)):
        if (index % trainer_count) == trainer_id:
            json_data = json.loads(json_line)
            json_data['audio_filepath'] = "tar:%s#%s" % (
                tarPath, json_data['audio_filepath'])
            result.append("%s\n" % json.dumps(json_data))
    with open(outManifest, 'w') as manifest:
        manifest.writelines(result)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=__doc__)

    parser.add_argument(
        "--in_manifest_path",
        default='datasets/dev.mani',
        type=str,
        help="Input manifest path. (default: %(default)s)")
    parser.add_argument(
        "--data_tar_path",
        default='datasets/dev.tar',
        type=str,
        help="Data tar file path. (default: %(default)s)")
    parser.add_argument(
        "--out_manifest_path",
        default='datasets/dev.mani.split',
        type=str,
        help="Out manifest file path. (default: %(default)s)")
    args = parser.parse_args()

    split_data(args.in_manifest_path, args.data_tar_path,
               args.out_manifest_path)
@ -1,13 +0,0 @@
paddlecloud submit \
    -image wanghaoshuang/pcloud_ds2 \
    -jobname ds23 \
    -cpu 1 \
    -gpu 0 \
    -memory 10Gi \
    -parallelism 1 \
    -pscpu 1 \
    -pservers 1 \
    -psmemory 10Gi \
    -passes 1 \
    -entry "sh pcloud_train.sh" \
    ./deep_speech_2
@ -1,32 +1,37 @@
+DATA_PATH=/pfs/dlnel/public/dataset/speech/libri
 #setted by user
-TRAIN_MANI='/pfs/dlnel/home/yanxu05@baidu.com/wanghaoshuang/data/ds2_data/demo.mani'
+TRAIN_MANI=${DATA_PATH}/manifest_pcloud.train
 #setted by user
-DEV_MANI='/pfs/dlnel/home/yanxu05@baidu.com/wanghaoshuang/data/ds2_data/demo.mani'
+DEV_MANI=${DATA_PATH}/manifest_pcloud.dev
 #setted by user
-TRAIN_TAR='/pfs/dlnel/home/yanxu05@baidu.com/wanghaoshuang/data/ds2_data/demo.tar'
+TRAIN_TAR=${DATA_PATH}/data.train.tar
 #setted by user
-DEV_TAR='/pfs/dlnel/home/yanxu05@baidu.com/wanghaoshuang/data/ds2_data/demo.tar'
+DEV_TAR=${DATA_PATH}/data.dev.tar
 #setted by user
-VOCAB_PATH='/pfs/dlnel/home/yanxu05@baidu.com/wanghaoshuang/data/ds2_data/eng_vocab.txt'
+VOCAB_PATH=${DATA_PATH}/eng_vocab.txt
 #setted by user
-MEAN_STD_FILE='/pfs/dlnel/home/yanxu05@baidu.com/wanghaoshuang/data/ds2_data/mean_std.npz'
+MEAN_STD_FILE=${DATA_PATH}/mean_std.npz

 tar -xzvf deepspeech.tar.gz
+rm -rf ./cloud/data/*

 # split train data for each pcloud node
-python pcloud_split_data.py \
+python ./cloud/pcloud_split_data.py \
     --in_manifest_path=$TRAIN_MANI \
     --data_tar_path=$TRAIN_TAR \
-    --out_manifest_path='./train.mani'
+    --out_manifest_path='./cloud/data/train.mani'

 # split dev data for each pcloud node
 python pcloud_split_data.py \
     --in_manifest_path=$DEV_MANI \
     --data_tar_path=$DEV_TAR \
-    --out_manifest_path='./dev.mani'
+    --out_manifest_path='./cloud/data/dev.mani'

 python train.py \
-    --use_gpu=0 \
+    --use_gpu=1 \
     --trainer_count=4 \
-    --batch_size=2 \
+    --batch_size=256 \
     --mean_std_filepath=$MEAN_STD_FILE \
-    --train_manifest_path='./train.mani' \
-    --dev_manifest_path='./dev.mani' \
+    --train_manifest_path='./cloud/data/train.mani' \
+    --dev_manifest_path='./cloud/data/dev.mani' \
     --vocab_filepath=$VOCAB_PATH \