parent d2bdd254a3
commit d74f4ff3f5
@@ -1,17 +0,0 @@
"""Set up paths for DS2"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os.path
import sys


def add_path(path):
    if path not in sys.path:
        sys.path.insert(0, path)


this_dir = os.path.dirname(__file__)
proj_path = os.path.join(this_dir, '..')
add_path(proj_path)
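This small module exists so that scripts in the cloud directory can import packages from the project root without installing DS2; upload_data.py below relies on it. A minimal sketch of the intended usage pattern, assuming a hypothetical caller sitting next to _init_paths.py and an example manifest path taken from the upload script:

import _init_paths  # noqa: F401 -- side effect: prepends the project root to sys.path
from data_utils.utils import read_manifest  # resolvable only after _init_paths has run

# Each manifest entry is a JSON object with at least an 'audio_filepath' field,
# as consumed by upload_data.py below. The path here is just an example.
for entry in read_manifest("../data/librispeech/manifest.train"):
    print(entry['audio_filepath'])
    break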
@@ -1,29 +0,0 @@
#! /usr/bin/env bash

TRAIN_MANIFEST="cloud/cloud_manifests/cloud.manifest.train"
DEV_MANIFEST="cloud/cloud_manifests/cloud.manifest.dev"
CLOUD_MODEL_DIR="./checkpoints"
BATCH_SIZE=512
NUM_GPU=8
NUM_NODE=1
IS_LOCAL="True"

JOB_NAME=deepspeech-`date +%Y%m%d%H%M%S`
DS2_PATH=${PWD%/*}
cp -f pcloud_train.sh ${DS2_PATH}

paddlecloud submit \
-image bootstrapper:5000/paddlepaddle/pcloud_ds2:latest \
-jobname ${JOB_NAME} \
-cpu ${NUM_GPU} \
-gpu ${NUM_GPU} \
-memory 64Gi \
-parallelism ${NUM_NODE} \
-pscpu 1 \
-pservers 1 \
-psmemory 64Gi \
-passes 1 \
-entry "sh pcloud_train.sh ${TRAIN_MANIFEST} ${DEV_MANIFEST} ${CLOUD_MODEL_DIR} ${NUM_GPU} ${BATCH_SIZE} ${IS_LOCAL}" \
${DS2_PATH}

rm ${DS2_PATH}/pcloud_train.sh
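For reference, the same submission could be driven from Python using the subprocess pattern that upload_data.py applies to the paddlecloud CLI. This is only a sketch, not part of the repository; all flag values mirror the script above, and the job name format copies its `date` stamp:

import time
from subprocess import call

# Hypothetical Python equivalent of pcloud_submit.sh.
job_name = "deepspeech-%s" % time.strftime("%Y%m%d%H%M%S")
entry = ("sh pcloud_train.sh cloud/cloud_manifests/cloud.manifest.train "
         "cloud/cloud_manifests/cloud.manifest.dev ./checkpoints 8 512 True")

exit_code = call([
    "paddlecloud", "submit",
    "-image", "bootstrapper:5000/paddlepaddle/pcloud_ds2:latest",
    "-jobname", job_name,
    "-cpu", "8", "-gpu", "8", "-memory", "64Gi",
    "-parallelism", "1", "-pscpu", "1", "-pservers", "1",
    "-psmemory", "64Gi", "-passes", "1",
    "-entry", entry,
    "..",  # DS2_PATH: the parent directory, as in the script
])
if exit_code != 0:
    raise IOError("paddlecloud submit failed for job %s" % job_name)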
@@ -1,46 +0,0 @@
#! /usr/bin/env bash

TRAIN_MANIFEST=$1
DEV_MANIFEST=$2
MODEL_PATH=$3
NUM_GPU=$4
BATCH_SIZE=$5
IS_LOCAL=$6

python ./cloud/split_data.py \
--in_manifest_path=${TRAIN_MANIFEST} \
--out_manifest_path='/local.manifest.train'

python ./cloud/split_data.py \
--in_manifest_path=${DEV_MANIFEST} \
--out_manifest_path='/local.manifest.dev'

mkdir ./logs

python -u train.py \
--batch_size=${BATCH_SIZE} \
--trainer_count=${NUM_GPU} \
--num_passes=200 \
--num_proc_data=${NUM_GPU} \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--num_iter_print=100 \
--learning_rate=5e-4 \
--max_duration=27.0 \
--min_duration=0.0 \
--use_sortagrad=True \
--use_gru=False \
--use_gpu=True \
--is_local=${IS_LOCAL} \
--share_rnn_weights=True \
--train_manifest='/local.manifest.train' \
--dev_manifest='/local.manifest.dev' \
--mean_std_path='data/librispeech/mean_std.npz' \
--vocab_path='data/librispeech/vocab.txt' \
--output_model_dir=${MODEL_PATH} \
--augment_conf_path='conf/augmentation.config' \
--specgram_type='linear' \
--shuffle_method='batch_shuffle_clipped' \
2>&1 | tee ./logs/train.log
@@ -1,22 +0,0 @@
#! /usr/bin/env bash

mkdir cloud_manifests

IN_MANIFESTS="../data/librispeech/manifest.train ../data/librispeech/manifest.dev-clean ../data/librispeech/manifest.test-clean"
OUT_MANIFESTS="cloud_manifests/cloud.manifest.train cloud_manifests/cloud.manifest.dev cloud_manifests/cloud.manifest.test"
CLOUD_DATA_DIR="/pfs/dlnel/home/USERNAME/deepspeech2/data/librispeech"
NUM_SHARDS=50

python upload_data.py \
--in_manifest_paths ${IN_MANIFESTS} \
--out_manifest_paths ${OUT_MANIFESTS} \
--cloud_data_dir ${CLOUD_DATA_DIR} \
--num_shards ${NUM_SHARDS}

if [ $? -ne 0 ]
then
    echo "Upload Data Failed!"
    exit 1
fi

echo "All Done."
@@ -1,41 +0,0 @@
"""This tool splits the input manifest across the nodes of a
paddlecloud job. It is intended to be run on paddlecloud.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import json
import argparse

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--in_manifest_path",
    type=str,
    required=True,
    help="Input manifest path for all nodes.")
parser.add_argument(
    "--out_manifest_path",
    type=str,
    required=True,
    help="Output manifest file path for current node.")
args = parser.parse_args()


def split_data(in_manifest_path, out_manifest_path):
    with open("/trainer_id", "r") as f:
        trainer_id = int(f.readline()[:-1])
    with open("/trainer_count", "r") as f:
        trainer_count = int(f.readline()[:-1])

    out_manifest = []
    for index, json_line in enumerate(open(in_manifest_path, 'r')):
        if (index % trainer_count) == trainer_id:
            out_manifest.append("%s\n" % json_line.strip())
    with open(out_manifest_path, 'w') as f:
        f.writelines(out_manifest)


if __name__ == '__main__':
    split_data(args.in_manifest_path, args.out_manifest_path)
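The split rule above is a plain round-robin over manifest lines: line index modulo trainer_count selects the owning node. A local illustration with made-up values, standing in for the /trainer_id and /trainer_count files that paddlecloud provides inside each node's container:

# Illustration only: the same assignment rule split_data() applies.
lines = ["line-%d" % i for i in range(10)]  # stand-in for manifest lines
trainer_count = 4

for trainer_id in range(trainer_count):
    picked = [line for index, line in enumerate(lines)
              if index % trainer_count == trainer_id]
    print("trainer %d gets: %s" % (trainer_id, picked))
# trainer 0 gets lines 0, 4, 8; trainer 1 gets lines 1, 5, 9; and so on --
# every manifest line is assigned to exactly one node.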
@@ -1,129 +0,0 @@
"""This script is for uploading data for DeepSpeech2 training on paddlecloud.

Steps:
1. Read original manifests and extract local sound files.
2. Tar all local sound files into multiple tar files and upload them.
3. Modify original manifests with updated paths in cloud filesystem.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import json
import os
import tarfile
import sys
import argparse
import shutil
from subprocess import call
import _init_paths
from data_utils.utils import read_manifest

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--in_manifest_paths",
    default=[
        "../datasets/manifest.train", "../datasets/manifest.dev",
        "../datasets/manifest.test"
    ],
    type=str,
    nargs='+',
    help="Local filepaths of input manifests to load, pack and upload. "
    "(default: %(default)s)")
parser.add_argument(
    "--out_manifest_paths",
    default=[
        "./cloud.manifest.train", "./cloud.manifest.dev",
        "./cloud.manifest.test"
    ],
    type=str,
    nargs='+',
    help="Local filepaths of modified manifests to write to. "
    "(default: %(default)s)")
parser.add_argument(
    "--cloud_data_dir",
    required=True,
    type=str,
    help="Destination directory on paddlecloud to upload data to.")
parser.add_argument(
    "--num_shards",
    default=10,
    type=int,
    help="Number of parts to split data to. (default: %(default)s)")
parser.add_argument(
    "--local_tmp_dir",
    default="./tmp/",
    type=str,
    help="Local directory for storing temporary data. (default: %(default)s)")
args = parser.parse_args()


def upload_data(in_manifest_path_list, out_manifest_path_list, local_tmp_dir,
                upload_tar_dir, num_shards):
    """Extract and pack sound files listed in the manifest files into multiple
    tar files and upload them to paddlecloud. Also generate new manifest
    files with updated paths in paddlecloud.
    """
    # compute total audio number
    total_line = 0
    for manifest_path in in_manifest_path_list:
        with open(manifest_path, 'r') as f:
            total_line += len(f.readlines())
    line_per_tar = (total_line // num_shards) + 1

    # pack and upload shard by shard
    line_count, tar_file = 0, None
    for manifest_path, out_manifest_path in zip(in_manifest_path_list,
                                                out_manifest_path_list):
        manifest = read_manifest(manifest_path)
        out_manifest = []
        for json_data in manifest:
            sound_filepath = json_data['audio_filepath']
            sound_filename = os.path.basename(sound_filepath)
            if line_count % line_per_tar == 0:
                if tar_file is not None:
                    tar_file.close()
                    pcloud_cp(tar_path, upload_tar_dir)
                    os.remove(tar_path)
                tar_name = 'part-%s-of-%s.tar' % (
                    str(line_count // line_per_tar).zfill(5),
                    str(num_shards).zfill(5))
                tar_path = os.path.join(local_tmp_dir, tar_name)
                tar_file = tarfile.open(tar_path, 'w')
            tar_file.add(sound_filepath, arcname=sound_filename)
            line_count += 1
            json_data['audio_filepath'] = "tar:%s#%s" % (
                os.path.join(upload_tar_dir, tar_name), sound_filename)
            out_manifest.append("%s\n" % json.dumps(json_data))
        with open(out_manifest_path, 'w') as f:
            f.writelines(out_manifest)
        pcloud_cp(out_manifest_path, upload_tar_dir)
    tar_file.close()
    pcloud_cp(tar_path, upload_tar_dir)
    os.remove(tar_path)


def pcloud_mkdir(dir):
    """Make directory in PaddleCloud filesystem.
    """
    if call(['paddlecloud', 'mkdir', dir]) != 0:
        raise IOError("PaddleCloud mkdir failed: %s." % dir)


def pcloud_cp(src, dst):
    """Copy src from local filesystem to dst in PaddleCloud filesystem,
    or download src from PaddleCloud filesystem to dst in local filesystem.
    """
    if call(['paddlecloud', 'cp', src, dst]) != 0:
        raise IOError("PaddleCloud cp failed: from [%s] to [%s]." % (src, dst))


if __name__ == '__main__':
    if not os.path.exists(args.local_tmp_dir):
        os.makedirs(args.local_tmp_dir)
    pcloud_mkdir(args.cloud_data_dir)

    upload_data(args.in_manifest_paths, args.out_manifest_paths,
                args.local_tmp_dir, args.cloud_data_dir, args.num_shards)

    shutil.rmtree(args.local_tmp_dir)
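The key transformation in upload_data() is the manifest rewrite: a local audio path becomes "tar:<cloud tar path>#<member name>", pointing into one of the uploaded shards. A small illustration with made-up values (the manifest entry and shard name below are examples, and the read-back step only sketches what a consumer of such paths would presumably do):

import json
import os.path

# Example manifest entry; field names match those used by upload_data() above.
entry = {"audio_filepath": "/local/data/LibriSpeech/1-2-0003.wav",
         "duration": 3.52, "text": "example transcript"}
upload_tar_dir = "/pfs/dlnel/home/USERNAME/deepspeech2/data/librispeech"
tar_name = "part-00000-of-00050.tar"  # hypothetical shard name

sound_filename = os.path.basename(entry["audio_filepath"])
entry["audio_filepath"] = "tar:%s#%s" % (
    os.path.join(upload_tar_dir, tar_name), sound_filename)
print(json.dumps(entry))
# -> {"audio_filepath": "tar:/pfs/.../part-00000-of-00050.tar#1-2-0003.wav", ...}

# A consumer can split such a path back into archive and member name; the DS2
# data pipeline is assumed to do something equivalent when it reads the audio.
tar_path, member = entry["audio_filepath"][len("tar:"):].split("#", 1)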
(Binary image changed: 153 KiB before, 206 KiB after.)
@@ -1,4 +1,4 @@
-scipy==0.13.1
+scipy==1.2.1
 resampy==0.1.5
 SoundFile==0.9.0.post1
 python_speech_features