|
|
|
@ -1,30 +1,31 @@
|
|
|
|
|
"""This tool is used for preparing data for DeepSpeech2 trainning on paddle cloud.
|
|
|
|
|
"""This script is used for preparing data for DeepSpeech2 trainning on paddle
|
|
|
|
|
cloud.
|
|
|
|
|
|
|
|
|
|
Steps:
|
|
|
|
|
1. Read original manifest and get the local path of sound files.
|
|
|
|
|
2. Tar all local sound files into one tar file.
|
|
|
|
|
3. Modify original manifest to remove the local path information.
|
|
|
|
|
|
|
|
|
|
Finally, we will get a tar file and a manifest with sound file name, duration
|
|
|
|
|
and text.
|
|
|
|
|
Finally, we will get a tar file and a new manifest.
|
|
|
|
|
"""
|
|
|
|
|
from __future__ import absolute_import
|
|
|
|
|
from __future__ import division
|
|
|
|
|
from __future__ import print_function
|
|
|
|
|
|
|
|
|
|
import json
|
|
|
|
|
import os
|
|
|
|
|
import tarfile
|
|
|
|
|
import sys
|
|
|
|
|
import argparse
|
|
|
|
|
import shutil
|
|
|
|
|
sys.path.append('../')
|
|
|
|
|
from data_utils.utils import read_manifest
|
|
|
|
|
from subprocess import call
|
|
|
|
|
import _init_paths
|
|
|
|
|
from data_utils.utils import read_manifest
|
|
|
|
|
|
|
|
|
|
TRAIN_TAR = "cloud.train.tar"
|
|
|
|
|
TRAIN_MANIFEST = "cloud.train.manifest"
|
|
|
|
|
TEST_TAR = "cloud.test.tar"
|
|
|
|
|
TEST_MANIFEST = "cloud.test.manifest"
|
|
|
|
|
DEV_TAR = "cloud.dev.tar"
|
|
|
|
|
DEV_MANIFEST = "cloud.dev.manifest"
|
|
|
|
|
VOCAB_FILE = "vocab.txt"
|
|
|
|
|
MEAN_STD_FILE = "mean_std.npz"
|
|
|
|
|
|
|
|
|
@ -33,41 +34,41 @@ parser.add_argument(
|
|
|
|
|
"--train_manifest_path",
|
|
|
|
|
default="../datasets/manifest.train",
|
|
|
|
|
type=str,
|
|
|
|
|
help="Manifest file of train data. (default: %(default)s)")
|
|
|
|
|
help="Manifest file path for train data. (default: %(default)s)")
|
|
|
|
|
parser.add_argument(
|
|
|
|
|
"--test_manifest_path",
|
|
|
|
|
default="../datasets/manifest.test",
|
|
|
|
|
"--dev_manifest_path",
|
|
|
|
|
default="../datasets/manifest.dev",
|
|
|
|
|
type=str,
|
|
|
|
|
help="Manifest file of test data. (default: %(default)s)")
|
|
|
|
|
help="Manifest file path for validation data. (default: %(default)s)")
|
|
|
|
|
parser.add_argument(
|
|
|
|
|
"--vocab_file",
|
|
|
|
|
default="../datasets/vocab/eng_vocab.txt",
|
|
|
|
|
type=str,
|
|
|
|
|
help="Vocab file to be uploaded to paddlecloud. (default: %(default)s)")
|
|
|
|
|
help="Vocabulary file to be uploaded to paddlecloud. "
|
|
|
|
|
"(default: %(default)s)")
|
|
|
|
|
parser.add_argument(
|
|
|
|
|
"--mean_std_file",
|
|
|
|
|
default="../mean_std.npz",
|
|
|
|
|
type=str,
|
|
|
|
|
help="mean_std file to be uploaded to paddlecloud. (default: %(default)s)")
|
|
|
|
|
help="Normalizer's statistics (mean and stddev) file to be uploaded to "
|
|
|
|
|
"paddlecloud. (default: %(default)s)")
|
|
|
|
|
parser.add_argument(
|
|
|
|
|
"--cloud_data_path",
|
|
|
|
|
required=True,
|
|
|
|
|
type=str,
|
|
|
|
|
help="Destination path on paddlecloud. (default: %(default)s)")
|
|
|
|
|
args = parser.parse_args()
|
|
|
|
|
|
|
|
|
|
help="Destination path on paddlecloud. (default: %(default)s)")
|
|
|
|
|
parser.add_argument(
|
|
|
|
|
"--local_tmp_path",
|
|
|
|
|
default="./tmp/",
|
|
|
|
|
type=str,
|
|
|
|
|
help="Local directory for storing temporary data. (default: %(default)s)")
|
|
|
|
|
help="Local directory for storing temporary data. (default: %(default)s)")
|
|
|
|
|
args = parser.parse_args()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def pack_data(manifest_path, out_tar_path, out_manifest_path):
|
|
|
|
|
'''1. According to the manifest, tar sound files into out_tar_path
|
|
|
|
|
2. Generate a new manifest for output tar file
|
|
|
|
|
'''
|
|
|
|
|
"""1. According to the manifest, tar sound files into out_tar_path.
|
|
|
|
|
2. Generate a new manifest for output tar file.
|
|
|
|
|
"""
|
|
|
|
|
out_tar = tarfile.open(out_tar_path, 'w')
|
|
|
|
|
manifest = read_manifest(manifest_path)
|
|
|
|
|
results = []
|
|
|
|
@ -83,11 +84,19 @@ def pack_data(manifest_path, out_tar_path, out_manifest_path):
|
|
|
|
|
out_tar.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def pcloud_mkdir(dir):
|
|
|
|
|
"""Make directory in PaddleCloud filesystem.
|
|
|
|
|
"""
|
|
|
|
|
if call(['paddlecloud', 'mkdir', dir]) != 0:
|
|
|
|
|
raise IOError("PaddleCloud mkdir failed: %s." % dir)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def pcloud_cp(src, dst):
|
|
|
|
|
"""Copy src from local filesytem to dst in PaddleCloud filesystem.
|
|
|
|
|
"""Copy src from local filesytem to dst in PaddleCloud filesystem,
|
|
|
|
|
or downlowd src from PaddleCloud filesystem to dst in local filesystem.
|
|
|
|
|
"""
|
|
|
|
|
ret = call(['paddlecloud', 'cp', src, dst])
|
|
|
|
|
return ret
|
|
|
|
|
if call(['paddlecloud', 'cp', src, dst]) != 0:
|
|
|
|
|
raise IOError("PaddleCloud cp failed: from [%s] to [%s]." % (src, dst))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def pcloud_exist(path):
|
|
|
|
@ -100,48 +109,34 @@ def pcloud_exist(path):
|
|
|
|
|
if __name__ == '__main__':
|
|
|
|
|
cloud_train_manifest = os.path.join(args.cloud_data_path, TRAIN_MANIFEST)
|
|
|
|
|
cloud_train_tar = os.path.join(args.cloud_data_path, TRAIN_TAR)
|
|
|
|
|
cloud_test_manifest = os.path.join(args.cloud_data_path, TEST_MANIFEST)
|
|
|
|
|
cloud_test_tar = os.path.join(args.cloud_data_path, TEST_TAR)
|
|
|
|
|
cloud_dev_manifest = os.path.join(args.cloud_data_path, DEV_MANIFEST)
|
|
|
|
|
cloud_dev_tar = os.path.join(args.cloud_data_path, DEV_TAR)
|
|
|
|
|
cloud_vocab_file = os.path.join(args.cloud_data_path, VOCAB_FILE)
|
|
|
|
|
cloud_mean_file = os.path.join(args.cloud_data_path, MEAN_STD_FILE)
|
|
|
|
|
|
|
|
|
|
local_train_manifest = os.path.join(args.local_tmp_path, TRAIN_MANIFEST)
|
|
|
|
|
local_train_tar = os.path.join(args.local_tmp_path, TRAIN_TAR)
|
|
|
|
|
local_test_manifest = os.path.join(args.local_tmp_path, TEST_MANIFEST)
|
|
|
|
|
local_test_tar = os.path.join(args.local_tmp_path, TEST_TAR)
|
|
|
|
|
local_dev_manifest = os.path.join(args.local_tmp_path, DEV_MANIFEST)
|
|
|
|
|
local_dev_tar = os.path.join(args.local_tmp_path, DEV_TAR)
|
|
|
|
|
|
|
|
|
|
# prepare local and cloud dir
|
|
|
|
|
if os.path.exists(args.local_tmp_path):
|
|
|
|
|
shutil.rmtree(args.local_tmp_path)
|
|
|
|
|
os.makedirs(args.local_tmp_path)
|
|
|
|
|
pcloud_mkdir(args.cloud_data_path)
|
|
|
|
|
|
|
|
|
|
# pack and upload train data
|
|
|
|
|
pack_data(args.train_manifest_path, local_train_tar, local_train_manifest)
|
|
|
|
|
pcloud_cp(local_train_manifest, cloud_train_manifest)
|
|
|
|
|
pcloud_cp(local_train_tar, cloud_train_tar)
|
|
|
|
|
|
|
|
|
|
# pack and upload validation data
|
|
|
|
|
pack_data(args.dev_manifest_path, local_dev_tar, local_dev_manifest)
|
|
|
|
|
pcloud_cp(local_dev_manifest, cloud_dev_manifest)
|
|
|
|
|
pcloud_cp(local_dev_tar, cloud_dev_tar)
|
|
|
|
|
|
|
|
|
|
# train data
|
|
|
|
|
if args.train_manifest_path != "":
|
|
|
|
|
ret = pcloud_exist(cloud_train_manifest)
|
|
|
|
|
if ret != 0:
|
|
|
|
|
pack_data(args.train_manifest_path, local_train_tar,
|
|
|
|
|
local_train_manifest)
|
|
|
|
|
pcloud_cp(local_train_manifest, cloud_train_manifest)
|
|
|
|
|
pcloud_cp(local_train_tar, cloud_train_tar)
|
|
|
|
|
|
|
|
|
|
# test data
|
|
|
|
|
if args.test_manifest_path != "":
|
|
|
|
|
ret = pcloud_exist(cloud_test_manifest)
|
|
|
|
|
if ret != 0:
|
|
|
|
|
pack_data(args.test_manifest_path, local_test_tar,
|
|
|
|
|
local_test_manifest)
|
|
|
|
|
pcloud_cp(local_test_manifest, cloud_test_manifest)
|
|
|
|
|
pcloud_cp(local_test_tar, cloud_test_tar)
|
|
|
|
|
|
|
|
|
|
# vocab file
|
|
|
|
|
if args.vocab_file != "":
|
|
|
|
|
ret = pcloud_exist(cloud_vocab_file)
|
|
|
|
|
if ret != 0:
|
|
|
|
|
pcloud_cp(args.vocab_file, cloud_vocab_file)
|
|
|
|
|
|
|
|
|
|
# mean_std file
|
|
|
|
|
if args.mean_std_file != "":
|
|
|
|
|
ret = pcloud_exist(cloud_mean_file)
|
|
|
|
|
if ret != 0:
|
|
|
|
|
pcloud_cp(args.mean_std_file, cloud_mean_file)
|
|
|
|
|
# upload vocab file and mean_std file
|
|
|
|
|
pcloud_cp(args.vocab_file, cloud_vocab_file)
|
|
|
|
|
pcloud_cp(args.mean_std_file, cloud_mean_file)
|
|
|
|
|
|
|
|
|
|
shutil.rmtree(args.local_tmp_path)
|
|
|
|
|