From 9fa9a352ac46c2547fcedfa9def201e7ed06d760 Mon Sep 17 00:00:00 2001
From: wanghaoshuang
Date: Thu, 27 Jul 2017 13:53:37 +0800
Subject: [PATCH] Refine submitting scripts for DeepSpeech2 on PaddlePaddle Cloud.

---
 cloud/README.md            | 45 +++++++++++++++++
 .../pcloud_prepare_data.py | 32 ++++++++----
 cloud/pcloud_split_data.py | 50 +++++++++++++++++++
 cloud/pcloud_submit.sh     | 17 +++++++
 cloud/pcloud_train.sh      | 37 ++++++++++++++
 data_utils/data.py         | 13 +++--
 pcloud_split_data.py       | 47 -----------------
 pcloud_submit.sh           | 13 -----
 pcloud_train.sh            | 31 +++++++-----
 9 files changed, 197 insertions(+), 88 deletions(-)
 create mode 100644 cloud/README.md
 rename datasets/librispeech/pcloud_data.py => cloud/pcloud_prepare_data.py (61%)
 create mode 100644 cloud/pcloud_split_data.py
 create mode 100644 cloud/pcloud_submit.sh
 create mode 100644 cloud/pcloud_train.sh
 delete mode 100644 pcloud_split_data.py
 delete mode 100644 pcloud_submit.sh

diff --git a/cloud/README.md b/cloud/README.md
new file mode 100644
index 00000000..91a1d52a
--- /dev/null
+++ b/cloud/README.md
@@ -0,0 +1,45 @@
+# DeepSpeech2 on PaddlePaddle Cloud
+
+## Run DS2 with public data
+
+**Step 1:** Make sure the current directory is `models/deep_speech_2/cloud/`.
+
+**Step 2:** Submit the job by running `sh pcloud_submit.sh`:
+
+```
+$ sh pcloud_submit.sh
+$ uploading: deepspeech.tar.gz...
+$ uploading: pcloud_prepare_data.py...
+$ uploading: pcloud_split_data.py...
+$ uploading: pcloud_submit.sh...
+$ uploading: pcloud_train.sh...
+$ deepspeech20170727130129 submited.
+```
+The job name (`deepspeech20170727130129` in this example) is printed on the last line.
+
+**Step 3:** Fetch the logs from PaddlePaddle Cloud by running `paddlecloud logs -n 10000 deepspeech20170727130129`:
+
+```
+$ paddlecloud logs -n 10000 deepspeech20170727130129
+$ ==========================deepspeech20170727130129-trainer-6vk3m==========================
+label selector: paddle-job-pserver=deepspeech20170727130129, desired: 1
+running pod list: [('Running', '10.1.3.6')]
+label selector: paddle-job=deepspeech20170727130129, desired: 1
+running pod list: [('Running', '10.1.83.14')]
+Starting training job: /pfs/dlnel/home/yanxu05@baidu.com/jobs/deepspeech20170727130129, num_gradient_servers: 1, trainer_id: 0, version: v2
+I0727 05:01:42.969719 25 Util.cpp:166] commandline: --num_gradient_servers=1 --ports_num_for_sparse=1 --use_gpu=1 --trainer_id=0 --pservers=10.1.3.6 --trainer_count=4 --num_passes=1 --ports_num=1 --port=7164
+[INFO 2017-07-27 05:01:50,279 layers.py:2430] output for __conv_0__: c = 32, h = 81, w = 54, size = 139968
+[WARNING 2017-07-27 05:01:50,280 layers.py:2789] brelu is not recommend for batch normalization's activation, maybe the relu is better
+[INFO 2017-07-27 05:01:50,283 layers.py:2430] output for __conv_1__: c = 32, h = 41, w = 54, size = 70848
+[WARNING 2017-07-27 05:01:50,283 layers.py:2789] brelu is not recommend for batch normalization's activation, maybe the relu is better
+[WARNING 2017-07-27 05:01:50,287 layers.py:2789] is not recommend for batch normalization's activation, maybe the relu is better
+[WARNING 2017-07-27 05:01:50,291 layers.py:2789] is not recommend for batch normalization's activation, maybe the relu is better
+[WARNING 2017-07-27 05:01:50,295 layers.py:2789] is not recommend for batch normalization's activation, maybe the relu is better
+I0727 05:01:50.316176 25 MultiGradientMachine.cpp:99] numLogicalDevices=1 numThreads=4 numDevices=4
+I0727 05:01:50.454787 25 GradientMachine.cpp:85] Initing parameters..
+I0727 05:01:50.690007 25 GradientMachine.cpp:92] Init parameters done.
+```
+
+[More options and commands for PaddlePaddle Cloud](https://github.com/PaddlePaddle/cloud/blob/develop/doc/usage_cn.md)
+
+## Run DS2 with customized data
+TODO
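The tools in this patch exchange data as manifest files: one JSON object per line, carrying the `audio_filepath`, `duration` and `text` fields that the scripts below read and rewrite. A minimal sketch of producing one such line; the values are made up for illustration:

```
# One manifest line as the DS2 tools consume it: the audio path, its
# duration in seconds, and the transcript. All values are hypothetical.
import json

entry = {
    "audio_filepath": "/data/LibriSpeech/dev-clean/116-288045-0000.flac",
    "duration": 3.86,
    "text": "a sample transcript in lowercase",
}
print(json.dumps(entry))
```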
diff --git a/datasets/librispeech/pcloud_data.py b/cloud/pcloud_prepare_data.py
similarity index 61%
rename from datasets/librispeech/pcloud_data.py
rename to cloud/pcloud_prepare_data.py
index 91400114..2ffdaf63 100644
--- a/datasets/librispeech/pcloud_data.py
+++ b/cloud/pcloud_prepare_data.py
@@ -1,23 +1,36 @@
+"""
+This tool prepares data for DeepSpeech2 training on PaddlePaddle Cloud.
+
+Steps:
+1. Read the original manifest and collect the local paths of the sound files.
+2. Tar all the local sound files into one tar file.
+3. Rewrite the original manifest to remove the local path information.
+
+The result is a tar file plus a manifest that keeps only each sound file's
+name, duration and text.
+"""
 import json
 import os
 import tarfile
 import sys
 import argparse
+sys.path.append('../')
+from data_utils.utils import read_manifest
 
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
     "--manifest_path",
-    default="/manifest.train",
+    default="../datasets/manifest.train",
     type=str,
     help="Manifest of target data. (default: %(default)s)")
 parser.add_argument(
     "--out_tar_path",
-    default="/dev.tar",
+    default="./data/dev.tar",
     type=str,
     help="Output tar file path. (default: %(default)s)")
 parser.add_argument(
     "--out_manifest_path",
-    default="/dev.mani",
+    default="./data/dev.mani",
     type=str,
     help="Manifest of output data. (default: %(default)s)")
 args = parser.parse_args()
@@ -29,19 +42,16 @@ def gen_pcloud_data(manifest_path, out_tar_path, out_manifest_path):
     2. Generate a new manifest for output tar file
     '''
     out_tar = tarfile.open(out_tar_path, 'w')
-    manifest = []
-    for json_line in open(manifest_path):
-        try:
-            json_data = json.loads(json_line)
-        except Exception as e:
-            raise IOError("Error reading manifest: %s" % str(e))
+    manifest = read_manifest(manifest_path)
+    results = []
+    for json_data in manifest:
         sound_file = json_data['audio_filepath']
         filename = os.path.basename(sound_file)
         out_tar.add(sound_file, arcname=filename)
         json_data['audio_filepath'] = filename
-        manifest.append("%s\n" % json.dumps(json_data))
+        results.append("%s\n" % json.dumps(json_data))
     with open(out_manifest_path, 'w') as out_manifest:
-        out_manifest.writelines(manifest)
+        out_manifest.writelines(results)
         out_manifest.close()
     out_tar.close()
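A quick way to sanity-check the output of `pcloud_prepare_data.py` is to confirm that every file named in the stripped manifest is actually present in the tar. A minimal sketch, assuming the default output paths above:

```
# Verify the stripped manifest and the generated tar agree: every
# 'audio_filepath' (now a bare file name) must exist inside the tar.
import json
import tarfile

with tarfile.open("./data/dev.tar") as tar:
    names = set(tar.getnames())

checked = 0
with open("./data/dev.mani") as manifest:
    for line in manifest:
        entry = json.loads(line)
        assert entry["audio_filepath"] in names, entry["audio_filepath"]
        checked += 1
print("ok: all %d manifest entries are present in the tar" % checked)
```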
diff --git a/cloud/pcloud_split_data.py b/cloud/pcloud_split_data.py
new file mode 100644
index 00000000..8f98799a
--- /dev/null
+++ b/cloud/pcloud_split_data.py
@@ -0,0 +1,50 @@
+"""
+This tool splits the data across the nodes of PaddlePaddle Cloud
+according to the total trainer count and the current trainer id.
+A trainer is an instance in the k8s cluster.
+This script is meant to be run on PaddlePaddle Cloud.
+"""
+import os
+import json
+import argparse
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    "--in_manifest_path",
+    default='./cloud/data/dev.mani',
+    type=str,
+    help="Input manifest path. (default: %(default)s)")
+parser.add_argument(
+    "--data_tar_path",
+    default='./cloud/data/dev.tar',
+    type=str,
+    help="Data tar file path. (default: %(default)s)")
+parser.add_argument(
+    "--out_manifest_path",
+    default='./cloud/data/dev.mani.split',
+    type=str,
+    help="Output manifest file path. (default: %(default)s)")
+args = parser.parse_args()
+
+
+def split_data(in_manifest, tar_path, out_manifest):
+    with open("/trainer_id", "r") as f:
+        trainer_id = int(f.readline()[:-1])
+    with open("/trainer_count", "r") as f:
+        trainer_count = int(f.readline()[:-1])
+
+    tar_path = os.path.abspath(tar_path)
+    result = []
+    for index, json_line in enumerate(open(in_manifest)):
+        if (index % trainer_count) == trainer_id:
+            json_data = json.loads(json_line)
+            json_data['audio_filepath'] = "tar:%s#%s" % (
+                tar_path, json_data['audio_filepath'])
+            result.append("%s\n" % json.dumps(json_data))
+    with open(out_manifest, 'w') as manifest:
+        manifest.writelines(result)
+
+
+if __name__ == '__main__':
+    split_data(args.in_manifest_path, args.data_tar_path,
+               args.out_manifest_path)
diff --git a/cloud/pcloud_submit.sh b/cloud/pcloud_submit.sh
new file mode 100644
index 00000000..5d053501
--- /dev/null
+++ b/cloud/pcloud_submit.sh
@@ -0,0 +1,17 @@
+DS2_PATH=../
+tar -czf deepspeech.tar.gz ${DS2_PATH}
+JOB_NAME=deepspeech`date +%Y%m%d%H%M%S`
+cp pcloud_train.sh ${DS2_PATH}
+paddlecloud submit \
+-image wanghaoshuang/pcloud_ds2:latest-gpu-cudnn \
+-jobname ${JOB_NAME} \
+-cpu 4 \
+-gpu 4 \
+-memory 10Gi \
+-parallelism 1 \
+-pscpu 1 \
+-pservers 1 \
+-psmemory 10Gi \
+-passes 1 \
+-entry "sh pcloud_train.sh" \
+.
diff --git a/cloud/pcloud_train.sh b/cloud/pcloud_train.sh
new file mode 100644
index 00000000..385281ce
--- /dev/null
+++ b/cloud/pcloud_train.sh
@@ -0,0 +1,37 @@
+DATA_PATH=/pfs/dlnel/public/dataset/speech/libri
+# set by user
+TRAIN_MANI=${DATA_PATH}/manifest_pcloud.train
+# set by user
+DEV_MANI=${DATA_PATH}/manifest_pcloud.dev
+# set by user
+TRAIN_TAR=${DATA_PATH}/data.train.tar
+# set by user
+DEV_TAR=${DATA_PATH}/data.dev.tar
+# set by user
+VOCAB_PATH=${DATA_PATH}/eng_vocab.txt
+# set by user
+MEAN_STD_FILE=${DATA_PATH}/mean_std.npz
+
+tar -xzf deepspeech.tar.gz
+rm -rf ./cloud/data/*
+
+# split train data for each pcloud node
+python ./cloud/pcloud_split_data.py \
+--in_manifest_path=$TRAIN_MANI \
+--data_tar_path=$TRAIN_TAR \
+--out_manifest_path='./cloud/data/train.mani'
+
+# split dev data for each pcloud node
+python ./cloud/pcloud_split_data.py \
+--in_manifest_path=$DEV_MANI \
+--data_tar_path=$DEV_TAR \
+--out_manifest_path='./cloud/data/dev.mani'
+
+python train.py \
+--use_gpu=1 \
+--trainer_count=4 \
+--batch_size=256 \
+--mean_std_filepath=$MEAN_STD_FILE \
+--train_manifest_path='./cloud/data/train.mani' \
+--dev_manifest_path='./cloud/data/dev.mani' \
+--vocab_filepath=$VOCAB_PATH
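`pcloud_split_data.py` rewrites each `audio_filepath` into a `tar:<tar_path>#<file_name>` spec so that trainers can read audio straight out of the shared tar (which is why `data_utils/data.py` below gains a `tarfile` import). A minimal sketch of resolving such a spec; the real resolution lives in `data_utils`, and the helper name and sample spec here are hypothetical:

```
# Resolve a "tar:<tar_path>#<file_name>" spec, as written by
# pcloud_split_data.py, into the raw bytes of one audio file.
import tarfile

def read_tar_spec(spec):
    assert spec.startswith("tar:"), "not a tar spec: %s" % spec
    tar_path, file_name = spec[len("tar:"):].split("#", 1)
    with tarfile.open(tar_path) as tar:
        return tar.extractfile(file_name).read()

audio_bytes = read_tar_spec("tar:/tmp/dev.tar#sample-0001.wav")
print("read %d bytes" % len(audio_bytes))
```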
diff --git a/data_utils/data.py b/data_utils/data.py
index e1fa4747..5a5fa51b 100644
--- a/data_utils/data.py
+++ b/data_utils/data.py
@@ -6,11 +6,11 @@ from __future__ import division
 from __future__ import print_function
 
 import random
-import numpy as np
+import tarfile
 import multiprocessing
-from threading import local
+import numpy as np
 import paddle.v2 as paddle
-import tarfile
+from threading import local
 from data_utils import utils
 from data_utils.augmentor.augmentation import AugmentationPipeline
 from data_utils.featurizer.speech_featurizer import SpeechFeaturizer
@@ -52,6 +52,9 @@ class DataGenerator(object):
     :types max_freq: None|float
     :param specgram_type: Specgram feature type. Options: 'linear'.
     :type specgram_type: str
+    :param use_dB_normalization: Whether to normalize the audio to -20 dB
+                                 before extracting the features.
+    :type use_dB_normalization: bool
     :param num_threads: Number of CPU threads for processing data.
     :type num_threads: int
     :param random_seed: Random seed.
@@ -68,6 +71,7 @@ class DataGenerator(object):
                  window_ms=20.0,
                  max_freq=None,
                  specgram_type='linear',
+                 use_dB_normalization=True,
                  num_threads=multiprocessing.cpu_count(),
                  random_seed=0):
         self._max_duration = max_duration
@@ -80,7 +84,8 @@
             specgram_type=specgram_type,
             stride_ms=stride_ms,
             window_ms=window_ms,
-            max_freq=max_freq)
+            max_freq=max_freq,
+            use_dB_normalization=use_dB_normalization)
         self._num_threads = num_threads
         self._rng = random.Random(random_seed)
         self._epoch = 0
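The new `use_dB_normalization` flag makes the generator normalize audio to -20 dB before feature extraction. The actual implementation belongs to the featurizer code, so the following is only a sketch of the underlying arithmetic: measure the RMS level in decibels, then apply the linear gain that moves it to the target:

```
# Sketch of RMS normalization to a target dB level. Illustrative only,
# not the data_utils implementation.
import numpy as np

def normalize_to_db(samples, target_db=-20.0):
    rms_db = 10.0 * np.log10(np.mean(samples ** 2))  # current RMS level in dB
    gain_db = target_db - rms_db                     # dB gain needed
    return samples * 10.0 ** (gain_db / 20.0)        # dB gain as linear factor

samples = np.random.uniform(-0.5, 0.5, 16000)
normalized = normalize_to_db(samples)
print(10.0 * np.log10(np.mean(normalized ** 2)))     # ~ -20.0
```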
diff --git a/pcloud_split_data.py b/pcloud_split_data.py
deleted file mode 100644
index bf35383a..00000000
--- a/pcloud_split_data.py
+++ /dev/null
@@ -1,47 +0,0 @@
-import os
-import json
-import argparse
-
-
-def split_data(inManifest, tar_path, outManifest):
-    trainer_id = 1
-    trainer_count = 2
-    #with open("/trainer_id", "r") as f:
-    #    trainer_id = int(f.readline()[:-1])
-    #with open("/trainer_count", "r") as f:
-    #    trainer_count = int(f.readline()[:-1])
-
-    tarPath = os.path.abspath(tar_path)
-    result = []
-    for index, json_line in enumerate(open(inManifest)):
-        if (index % trainer_count) == trainer_id:
-            json_data = json.loads(json_line)
-            json_data['audio_filepath'] = "tar:%s#%s" % (
-                tarPath, json_data['audio_filepath'])
-            result.append("%s\n" % json.dumps(json_data))
-    with open(outManifest, 'w') as manifest:
-        manifest.writelines(result)
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description=__doc__)
-
-    parser.add_argument(
-        "--in_manifest_path",
-        default='datasets/dev.mani',
-        type=str,
-        help="Input manifest path. (default: %(default)s)")
-    parser.add_argument(
-        "--data_tar_path",
-        default='datasets/dev.tar',
-        type=str,
-        help="Data tar file path. (default: %(default)s)")
-    parser.add_argument(
-        "--out_manifest_path",
-        default='datasets/dev.mani.split',
-        type=str,
-        help="Out manifest file path. (default: %(default)s)")
-    args = parser.parse_args()
-
-    split_data(args.in_manifest_path, args.data_tar_path,
-               args.out_manifest_path)
diff --git a/pcloud_submit.sh b/pcloud_submit.sh
deleted file mode 100644
index 06e65110..00000000
--- a/pcloud_submit.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-paddlecloud submit \
--image wanghaoshuang/pcloud_ds2 \
--jobname ds23 \
--cpu 1 \
--gpu 0 \
--memory 10Gi \
--parallelism 1 \
--pscpu 1 \
--pservers 1 \
--psmemory 10Gi \
--passes 1 \
--entry "sh pcloud_train.sh" \
-./deep_speech_2
diff --git a/pcloud_train.sh b/pcloud_train.sh
index fb6cbb9e..b13e23e9 100644
--- a/pcloud_train.sh
+++ b/pcloud_train.sh
@@ -1,32 +1,37 @@
+DATA_PATH=/pfs/dlnel/public/dataset/speech/libri
 #setted by user
-TRAIN_MANI='/pfs/dlnel/home/yanxu05@baidu.com/wanghaoshuang/data/ds2_data/demo.mani'
+TRAIN_MANI=${DATA_PATH}/manifest_pcloud.train
 #setted by user
-DEV_MANI='/pfs/dlnel/home/yanxu05@baidu.com/wanghaoshuang/data/ds2_data/demo.mani'
+DEV_MANI=${DATA_PATH}/manifest_pcloud.dev
 #setted by user
-TRAIN_TAR='/pfs/dlnel/home/yanxu05@baidu.com/wanghaoshuang/data/ds2_data/demo.tar'
+TRAIN_TAR=${DATA_PATH}/data.train.tar
 #setted by user
-DEV_TAR='/pfs/dlnel/home/yanxu05@baidu.com/wanghaoshuang/data/ds2_data/demo.tar'
+DEV_TAR=${DATA_PATH}/data.dev.tar
 #setted by user
-VOCAB_PATH='/pfs/dlnel/home/yanxu05@baidu.com/wanghaoshuang/data/ds2_data/eng_vocab.txt'
+VOCAB_PATH=${DATA_PATH}/eng_vocab.txt
 #setted by user
-MEAN_STD_FILE='/pfs/dlnel/home/yanxu05@baidu.com/wanghaoshuang/data/ds2_data/mean_std.npz'
+MEAN_STD_FILE=${DATA_PATH}/mean_std.npz
+
+tar -xzvf deepspeech.tar.gz
+rm -rf ./cloud/data/*
 
 # split train data for each pcloud node
-python pcloud_split_data.py \
+python ./cloud/pcloud_split_data.py \
 --in_manifest_path=$TRAIN_MANI \
 --data_tar_path=$TRAIN_TAR \
---out_manifest_path='./train.mani'
+--out_manifest_path='./cloud/data/train.mani'
+
 # split dev data for each pcloud node
-python pcloud_split_data.py \
+python ./cloud/pcloud_split_data.py \
 --in_manifest_path=$DEV_MANI \
 --data_tar_path=$DEV_TAR \
---out_manifest_path='./dev.mani'
+--out_manifest_path='./cloud/data/dev.mani'
 
 python train.py \
---use_gpu=0 \
+--use_gpu=1 \
 --trainer_count=4 \
---batch_size=2 \
+--batch_size=256 \
 --mean_std_filepath=$MEAN_STD_FILE \
---train_manifest_path='./train.mani' \
---dev_manifest_path='./dev.mani' \
+--train_manifest_path='./cloud/data/train.mani' \
+--dev_manifest_path='./cloud/data/dev.mani' \
 --vocab_filepath=$VOCAB_PATH \
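Both versions of `pcloud_train.sh` shard the manifests with the `index % trainer_count == trainer_id` rule from `pcloud_split_data.py`. A minimal sketch showing that this round-robin rule hands every trainer a disjoint slice and that the slices together cover the whole manifest (line contents are hypothetical):

```
# Round-robin sharding as used by pcloud_split_data.py: trainer t keeps
# every line whose index is congruent to t modulo trainer_count.
lines = ["entry-%02d" % i for i in range(10)]
trainer_count = 4

shards = [[line for i, line in enumerate(lines) if i % trainer_count == t]
          for t in range(trainer_count)]

assert sorted(sum(shards, [])) == sorted(lines)  # disjoint and complete
for t, shard in enumerate(shards):
    print("trainer %d -> %s" % (t, shard))
```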