parent
d2bdd254a3
commit
d74f4ff3f5
@ -1,17 +0,0 @@
|
|||||||
"""Set up paths for DS2"""
|
|
||||||
from __future__ import absolute_import
|
|
||||||
from __future__ import division
|
|
||||||
from __future__ import print_function
|
|
||||||
|
|
||||||
import os.path
|
|
||||||
import sys
|
|
||||||
|
|
||||||
|
|
||||||
def add_path(path):
    """Prepend *path* to ``sys.path`` unless it is already present."""
    already_known = path in sys.path
    if not already_known:
        sys.path.insert(0, path)
|
|
||||||
|
|
||||||
|
|
||||||
# Make the project root (one directory above this file) importable so the
# cloud scripts can use top-level packages such as ``data_utils``.
this_dir = os.path.dirname(__file__)
proj_path = os.path.join(this_dir, '..')
add_path(proj_path)
|
|
@ -1,29 +0,0 @@
|
|||||||
#! /usr/bin/env bash
# Submit a DeepSpeech2 training job to PaddleCloud.
# Edit the variables below to change manifests, model dir and cluster size.

TRAIN_MANIFEST="cloud/cloud_manifests/cloud.manifest.train"
DEV_MANIFEST="cloud/cloud_manifests/cloud.manifest.dev"
CLOUD_MODEL_DIR="./checkpoints"
BATCH_SIZE=512
NUM_GPU=8
NUM_NODE=1
IS_LOCAL="True"

# Unique job name, e.g. deepspeech-20170901123000.
JOB_NAME=deepspeech-`date +%Y%m%d%H%M%S`
# Parent of the current working directory: the DS2 project root that gets
# uploaded as the job's code package.
DS2_PATH=${PWD%/*}
# Abort on copy failure instead of silently submitting a stale train script.
cp -f pcloud_train.sh ${DS2_PATH} || exit 1

paddlecloud submit \
-image bootstrapper:5000/paddlepaddle/pcloud_ds2:latest \
-jobname ${JOB_NAME} \
-cpu ${NUM_GPU} \
-gpu ${NUM_GPU} \
-memory 64Gi \
-parallelism ${NUM_NODE} \
-pscpu 1 \
-pservers 1 \
-psmemory 64Gi \
-passes 1 \
-entry "sh pcloud_train.sh ${TRAIN_MANIFEST} ${DEV_MANIFEST} ${CLOUD_MODEL_DIR} ${NUM_GPU} ${BATCH_SIZE} ${IS_LOCAL}" \
${DS2_PATH}

# Clean the copy out of the project root again after submission.
rm ${DS2_PATH}/pcloud_train.sh
|
|
@ -1,46 +0,0 @@
|
|||||||
#! /usr/bin/env bash
# Train DeepSpeech2 inside a PaddleCloud node.
# Positional args: $1 train manifest, $2 dev manifest, $3 model output dir,
# $4 GPU count, $5 batch size, $6 "True"/"False" for local (non-distributed).

TRAIN_MANIFEST=$1
DEV_MANIFEST=$2
MODEL_PATH=$3
NUM_GPU=$4
BATCH_SIZE=$5
IS_LOCAL=$6

# Keep only this node's round-robin shard of each manifest.
python ./cloud/split_data.py \
--in_manifest_path=${TRAIN_MANIFEST} \
--out_manifest_path='/local.manifest.train'

python ./cloud/split_data.py \
--in_manifest_path=${DEV_MANIFEST} \
--out_manifest_path='/local.manifest.dev'

# -p: do not fail when the directory already exists (e.g. job retry).
mkdir -p ./logs

python -u train.py \
--batch_size=${BATCH_SIZE} \
--trainer_count=${NUM_GPU} \
--num_passes=200 \
--num_proc_data=${NUM_GPU} \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--num_iter_print=100 \
--learning_rate=5e-4 \
--max_duration=27.0 \
--min_duration=0.0 \
--use_sortagrad=True \
--use_gru=False \
--use_gpu=True \
--is_local=${IS_LOCAL} \
--share_rnn_weights=True \
--train_manifest='/local.manifest.train' \
--dev_manifest='/local.manifest.dev' \
--mean_std_path='data/librispeech/mean_std.npz' \
--vocab_path='data/librispeech/vocab.txt' \
--output_model_dir=${MODEL_PATH} \
--augment_conf_path='conf/augmentation.config' \
--specgram_type='linear' \
--shuffle_method='batch_shuffle_clipped' \
2>&1 | tee ./logs/train.log
|
|
@ -1,22 +0,0 @@
|
|||||||
#! /usr/bin/env bash
# Pack local LibriSpeech manifests and audio into tar shards and upload
# them to PaddleCloud via upload_data.py.

# -p: succeed even if the directory already exists (re-runs).
mkdir -p cloud_manifests

IN_MANIFESTS="../data/librispeech/manifest.train ../data/librispeech/manifest.dev-clean ../data/librispeech/manifest.test-clean"
OUT_MANIFESTS="cloud_manifests/cloud.manifest.train cloud_manifests/cloud.manifest.dev cloud_manifests/cloud.manifest.test"
CLOUD_DATA_DIR="/pfs/dlnel/home/USERNAME/deepspeech2/data/librispeech"
NUM_SHARDS=50

# NOTE: IN_MANIFESTS / OUT_MANIFESTS are intentionally unquoted so the
# space-separated paths expand into separate arguments (argparse nargs='+').
python upload_data.py \
--in_manifest_paths ${IN_MANIFESTS} \
--out_manifest_paths ${OUT_MANIFESTS} \
--cloud_data_dir ${CLOUD_DATA_DIR} \
--num_shards ${NUM_SHARDS}

if [ $? -ne 0 ]
then
    echo "Upload Data Failed!"
    exit 1
fi

echo "All Done."
|
|
@ -1,41 +0,0 @@
|
|||||||
"""This tool is used for splitting data into each node of
|
|
||||||
paddlecloud. This script should be called in paddlecloud.
|
|
||||||
"""
|
|
||||||
from __future__ import absolute_import
|
|
||||||
from __future__ import division
|
|
||||||
from __future__ import print_function
|
|
||||||
|
|
||||||
import os
|
|
||||||
import json
|
|
||||||
import argparse
|
|
||||||
|
|
||||||
# Command-line interface: both manifest paths are mandatory string options.
parser = argparse.ArgumentParser(description=__doc__)
_ARG_SPECS = [
    ("--in_manifest_path", "Input manifest path for all nodes."),
    ("--out_manifest_path", "Output manifest file path for current node."),
]
for _flag, _help in _ARG_SPECS:
    parser.add_argument(_flag, type=str, required=True, help=_help)
args = parser.parse_args()
|
|
||||||
|
|
||||||
|
|
||||||
def _read_int(path):
    """Return the integer stored on the first line of *path*."""
    with open(path, "r") as f:
        # int() tolerates surrounding whitespace, so this also works when the
        # file lacks a trailing newline; the old ``readline()[:-1]`` slice
        # would have chopped off the last digit in that case.
        return int(f.readline())


def split_data(in_manifest_path, out_manifest_path):
    """Write the subset of *in_manifest_path* owned by this trainer node.

    PaddleCloud exposes the node's id and total node count in /trainer_id
    and /trainer_count; manifest lines are dealt out round-robin so every
    node gets a disjoint, near-equal share.
    """
    trainer_id = _read_int("/trainer_id")
    trainer_count = _read_int("/trainer_count")

    out_manifest = []
    # ``with`` closes the input manifest (the original leaked the handle).
    with open(in_manifest_path, 'r') as f:
        for index, json_line in enumerate(f):
            if (index % trainer_count) == trainer_id:
                out_manifest.append("%s\n" % json_line.strip())
    with open(out_manifest_path, 'w') as f:
        f.writelines(out_manifest)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Shard the input manifest for the node identified by /trainer_id.
    split_data(args.in_manifest_path, args.out_manifest_path)
|
|
@ -1,129 +0,0 @@
|
|||||||
"""This script is for uploading data for DeepSpeech2 training on paddlecloud.
|
|
||||||
|
|
||||||
Steps:
|
|
||||||
1. Read original manifests and extract local sound files.
|
|
||||||
2. Tar all local sound files into multiple tar files and upload them.
|
|
||||||
3. Modify original manifests with updated paths in cloud filesystem.
|
|
||||||
"""
|
|
||||||
from __future__ import absolute_import
|
|
||||||
from __future__ import division
|
|
||||||
from __future__ import print_function
|
|
||||||
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import tarfile
|
|
||||||
import sys
|
|
||||||
import argparse
|
|
||||||
import shutil
|
|
||||||
from subprocess import call
|
|
||||||
import _init_paths
|
|
||||||
from data_utils.utils import read_manifest
|
|
||||||
|
|
||||||
# Command-line interface. Only --cloud_data_dir is mandatory; the manifest
# path lists accept one or more values (nargs='+').
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--in_manifest_paths",
    type=str,
    nargs='+',
    default=[
        "../datasets/manifest.train", "../datasets/manifest.dev",
        "../datasets/manifest.test"
    ],
    help="Local filepaths of input manifests to load, pack and upload."
    "(default: %(default)s)")
parser.add_argument(
    "--out_manifest_paths",
    type=str,
    nargs='+',
    default=[
        "./cloud.manifest.train", "./cloud.manifest.dev",
        "./cloud.manifest.test"
    ],
    help="Local filepaths of modified manifests to write to. "
    "(default: %(default)s)")
parser.add_argument(
    "--cloud_data_dir",
    type=str,
    required=True,
    help="Destination directory on paddlecloud to upload data to.")
parser.add_argument(
    "--num_shards",
    type=int,
    default=10,
    help="Number of parts to split data to. (default: %(default)s)")
parser.add_argument(
    "--local_tmp_dir",
    type=str,
    default="./tmp/",
    help="Local directory for storing temporary data. (default: %(default)s)")
args = parser.parse_args()
|
|
||||||
|
|
||||||
|
|
||||||
def upload_data(in_manifest_path_list, out_manifest_path_list, local_tmp_dir,
                upload_tar_dir, num_shards):
    """Extract and pack sound files listed in the manifest files into multiple
    tar files and upload them to paddlecloud. Besides, generate new manifest
    files with updated paths in paddlecloud.
    """
    # Compute the total audio count so shards come out near-equal in size.
    total_line = 0
    for manifest_path in in_manifest_path_list:
        with open(manifest_path, 'r') as f:
            total_line += len(f.readlines())
    line_per_tar = (total_line // num_shards) + 1

    # Pack and upload shard by shard.
    line_count, tar_file, tar_path = 0, None, None
    for manifest_path, out_manifest_path in zip(in_manifest_path_list,
                                                out_manifest_path_list):
        manifest = read_manifest(manifest_path)
        out_manifest = []
        for json_data in manifest:
            sound_filepath = json_data['audio_filepath']
            sound_filename = os.path.basename(sound_filepath)
            if line_count % line_per_tar == 0:
                # Shard boundary: ship the finished tar (if any), then open
                # a fresh one.  (``is not None`` replaces the non-idiomatic
                # ``!= None`` comparison.)
                if tar_file is not None:
                    tar_file.close()
                    pcloud_cp(tar_path, upload_tar_dir)
                    os.remove(tar_path)
                tar_name = 'part-%s-of-%s.tar' % (
                    str(line_count // line_per_tar).zfill(5),
                    str(num_shards).zfill(5))
                tar_path = os.path.join(local_tmp_dir, tar_name)
                tar_file = tarfile.open(tar_path, 'w')
            tar_file.add(sound_filepath, arcname=sound_filename)
            line_count += 1
            # Rewrite the entry to point inside the uploaded tar shard.
            json_data['audio_filepath'] = "tar:%s#%s" % (
                os.path.join(upload_tar_dir, tar_name), sound_filename)
            out_manifest.append("%s\n" % json.dumps(json_data))
        with open(out_manifest_path, 'w') as f:
            f.writelines(out_manifest)
        pcloud_cp(out_manifest_path, upload_tar_dir)
    # Flush the final (possibly partial) shard.  The guard also fixes a crash
    # on completely empty manifests, where no tar was ever opened and the
    # unconditional ``tar_file.close()`` raised AttributeError on None.
    if tar_file is not None:
        tar_file.close()
        pcloud_cp(tar_path, upload_tar_dir)
        os.remove(tar_path)
|
|
||||||
|
|
||||||
|
|
||||||
def pcloud_mkdir(dir):
    """Create *dir* in the PaddleCloud filesystem.

    Raises IOError when the ``paddlecloud mkdir`` command exits non-zero.
    """
    exit_code = call(['paddlecloud', 'mkdir', dir])
    if exit_code != 0:
        raise IOError("PaddleCloud mkdir failed: %s." % dir)
|
|
||||||
|
|
||||||
|
|
||||||
def pcloud_cp(src, dst):
    """Copy ``src`` from the local filesystem to ``dst`` on PaddleCloud,
    or download ``src`` from PaddleCloud to ``dst`` locally.

    Raises IOError when the ``paddlecloud cp`` command exits non-zero.
    """
    exit_code = call(['paddlecloud', 'cp', src, dst])
    if exit_code != 0:
        raise IOError("PaddleCloud cp failed: from [%s] to [%s]." % (src, dst))
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Stage data through a local scratch directory, upload, then clean up.
    if not os.path.exists(args.local_tmp_dir):
        os.makedirs(args.local_tmp_dir)
    pcloud_mkdir(args.cloud_data_dir)

    upload_data(args.in_manifest_paths, args.out_manifest_paths,
                args.local_tmp_dir, args.cloud_data_dir, args.num_shards)

    shutil.rmtree(args.local_tmp_dir)
|
|
Before Width: | Height: | Size: 153 KiB After Width: | Height: | Size: 206 KiB |
@ -1,4 +1,4 @@
|
|||||||
scipy==0.13.1
|
scipy==1.2.1
|
||||||
resampy==0.1.5
|
resampy==0.1.5
|
||||||
SoundFile==0.9.0.post1
|
SoundFile==0.9.0.post1
|
||||||
python_speech_features
|
python_speech_features
|
||||||
|
Loading…
Reference in new issue