@ -1,30 +1,31 @@
""" This tool is used for preparing data for DeepSpeech2 training on paddle cloud.
""" This script is used for preparing data for DeepSpeech2 training on paddle
cloud .
Steps :
Steps :
1. Read original manifest and get the local path of sound files .
1. Read original manifest and get the local path of sound files .
2. Tar all local sound files into one tar file .
2. Tar all local sound files into one tar file .
3. Modify original manifest to remove the local path information .
3. Modify original manifest to remove the local path information .
Finally , we will get a tar file and a manifest with sound file name , duration
Finally , we will get a tar file and a new manifest .
and text .
"""
"""
from __future__ import absolute_import
from __future__ import absolute_import
from __future__ import division
from __future__ import division
from __future__ import print_function
from __future__ import print_function
import json
import json
import os
import os
import tarfile
import tarfile
import sys
import sys
import argparse
import argparse
import shutil
import shutil
sys . path . append ( ' ../ ' )
from data_utils . utils import read_manifest
from subprocess import call
from subprocess import call
import _init_paths
from data_utils . utils import read_manifest
# File names of the artifacts handled by this script. The main section joins
# every name with --cloud_data_path, and the tar/manifest names additionally
# with --local_tmp_path.
TRAIN_TAR = " cloud.train.tar "
TRAIN_TAR = " cloud.train.tar "
TRAIN_MANIFEST = " cloud.train.manifest "
TRAIN_MANIFEST = " cloud.train.manifest "
TEST_TAR = " cloud.test .tar"
DEV_TAR = " cloud.dev .tar"
TEST_MANIFEST = " cloud.test .manifest"
DEV_MANIFEST = " cloud.dev .manifest"
VOCAB_FILE = " vocab.txt "
VOCAB_FILE = " vocab.txt "
MEAN_STD_FILE = " mean_std.npz "
MEAN_STD_FILE = " mean_std.npz "
@ -33,41 +34,41 @@ parser.add_argument(
" --train_manifest_path " ,
" --train_manifest_path " ,
default = " ../datasets/manifest.train " ,
default = " ../datasets/manifest.train " ,
type = str ,
type = str ,
help = " Manifest file of train data. (default: %(default)s ) " )
help = " Manifest file path for train data. (default: %(default)s ) " )
parser . add_argument (
parser . add_argument (
" -- test _manifest_path" ,
" -- dev _manifest_path" ,
default = " ../datasets/manifest. test " ,
default = " ../datasets/manifest. dev " ,
type = str ,
type = str ,
help = " Manifest file of test data. (default: %(default)s ) " )
help = " Manifest file path for validation data. (default: %(default)s ) " )
parser . add_argument (
parser . add_argument (
" --vocab_file " ,
" --vocab_file " ,
default = " ../datasets/vocab/eng_vocab.txt " ,
default = " ../datasets/vocab/eng_vocab.txt " ,
type = str ,
type = str ,
help = " Vocab file to be uploaded to paddlecloud. (default: %(default)s ) " )
help = " Vocabulary file to be uploaded to paddlecloud. "
" (default: %(default)s ) " )
parser . add_argument (
parser . add_argument (
" --mean_std_file " ,
" --mean_std_file " ,
default = " ../mean_std.npz " ,
default = " ../mean_std.npz " ,
type = str ,
type = str ,
help = " mean_std file to be uploaded to paddlecloud. (default: %(default)s ) " )
help = " Normalizer ' s statistics (mean and stddev) file to be uploaded to "
" paddlecloud. (default: %(default)s ) " )
parser . add_argument (
parser . add_argument (
" --cloud_data_path " ,
" --cloud_data_path " ,
required = True ,
required = True ,
type = str ,
type = str ,
help = " Destination path on paddlecloud. (default: %(default)s ) " )
help = " Destination path on paddlecloud. (default: %(default)s ) " )
args = parser . parse_args ( )
parser . add_argument (
parser . add_argument (
" --local_tmp_path " ,
" --local_tmp_path " ,
default = " ./tmp/ " ,
default = " ./tmp/ " ,
type = str ,
type = str ,
help = " Local directory for storing temporary data. (default: %(default)s ) " )
help = " Local directory for storing temporary data. (default: %(default)s ) " )
args = parser . parse_args ( )
args = parser . parse_args ( )
def pack_data ( manifest_path , out_tar_path , out_manifest_path ) :
def pack_data ( manifest_path , out_tar_path , out_manifest_path ) :
''' 1. According to the manifest, tar sound files into out_tar_path
""" 1. According to the manifest, tar sound files into out_tar_path.
2. Generate a new manifest for output tar file
2. Generate a new manifest for output tar file .
'''
"""
out_tar = tarfile . open ( out_tar_path , ' w ' )
out_tar = tarfile . open ( out_tar_path , ' w ' )
manifest = read_manifest ( manifest_path )
manifest = read_manifest ( manifest_path )
results = [ ]
results = [ ]
@ -83,11 +84,19 @@ def pack_data(manifest_path, out_tar_path, out_manifest_path):
out_tar . close ( )
out_tar . close ( )
def pcloud_mkdir ( dir ) :
""" Make directory in PaddleCloud filesystem.

Invokes the `paddlecloud mkdir` command on `dir` through subprocess `call`
and raises IOError when the command returns a non-zero exit status.
"""
if call ( [ ' paddlecloud ' , ' mkdir ' , dir ] ) != 0 :
raise IOError ( " PaddleCloud mkdir failed: %s . " % dir )
def pcloud_cp ( src , dst ) :
def pcloud_cp ( src , dst ) :
""" Copy src from local filesystem to dst in PaddleCloud filesystem.
""" Copy src from local filesystem to dst in PaddleCloud filesystem,
or download src from PaddleCloud filesystem to dst in local filesystem .
"""
"""
ret = call ( [ ' paddlecloud ' , ' cp ' , src , dst ] )
if call ( [ ' paddlecloud ' , ' cp ' , src , dst ] ) != 0 :
return ret
raise IOError ( " PaddleCloud cp failed: from [ %s ] to [ %s ]. " % ( src , dst ) )
def pcloud_exist ( path ) :
def pcloud_exist ( path ) :
@ -100,48 +109,34 @@ def pcloud_exist(path):
if __name__ == ' __main__ ' :
if __name__ == ' __main__ ' :
cloud_train_manifest = os . path . join ( args . cloud_data_path , TRAIN_MANIFEST )
cloud_train_manifest = os . path . join ( args . cloud_data_path , TRAIN_MANIFEST )
cloud_train_tar = os . path . join ( args . cloud_data_path , TRAIN_TAR )
cloud_train_tar = os . path . join ( args . cloud_data_path , TRAIN_TAR )
cloud_ test _manifest = os . path . join ( args . cloud_data_path , TEST _MANIFEST)
cloud_ dev _manifest = os . path . join ( args . cloud_data_path , DEV _MANIFEST)
cloud_ test _tar = os . path . join ( args . cloud_data_path , TEST _TAR)
cloud_ dev _tar = os . path . join ( args . cloud_data_path , DEV _TAR)
cloud_vocab_file = os . path . join ( args . cloud_data_path , VOCAB_FILE )
cloud_vocab_file = os . path . join ( args . cloud_data_path , VOCAB_FILE )
cloud_mean_file = os . path . join ( args . cloud_data_path , MEAN_STD_FILE )
cloud_mean_file = os . path . join ( args . cloud_data_path , MEAN_STD_FILE )
local_train_manifest = os . path . join ( args . local_tmp_path , TRAIN_MANIFEST )
local_train_manifest = os . path . join ( args . local_tmp_path , TRAIN_MANIFEST )
local_train_tar = os . path . join ( args . local_tmp_path , TRAIN_TAR )
local_train_tar = os . path . join ( args . local_tmp_path , TRAIN_TAR )
local_ test _manifest = os . path . join ( args . local_tmp_path , TEST _MANIFEST)
local_ dev _manifest = os . path . join ( args . local_tmp_path , DEV _MANIFEST)
local_ test _tar = os . path . join ( args . local_tmp_path , TEST _TAR)
local_ dev _tar = os . path . join ( args . local_tmp_path , DEV _TAR)
# Recreate the local temporary directory from scratch, then create the cloud destination directory.
if os . path . exists ( args . local_tmp_path ) :
if os . path . exists ( args . local_tmp_path ) :
shutil . rmtree ( args . local_tmp_path )
shutil . rmtree ( args . local_tmp_path )
os . makedirs ( args . local_tmp_path )
os . makedirs ( args . local_tmp_path )
pcloud_mkdir ( args . cloud_data_path )
# Pack the train data into a local tar and manifest, then upload both.
pack_data ( args . train_manifest_path , local_train_tar , local_train_manifest )
pcloud_cp ( local_train_manifest , cloud_train_manifest )
pcloud_cp ( local_train_tar , cloud_train_tar )
# Pack the validation data into a local tar and manifest, then upload both.
pack_data ( args . dev_manifest_path , local_dev_tar , local_dev_manifest )
pcloud_cp ( local_dev_manifest , cloud_dev_manifest )
pcloud_cp ( local_dev_tar , cloud_dev_tar )
# Train data: pack and upload only when pcloud_exist returns non-zero for the cloud manifest.
# Upload the vocabulary file and the mean_std file.
if args . train_manifest_path != " " :
pcloud_cp ( args . vocab_file , cloud_vocab_file )
ret = pcloud_exist ( cloud_train_manifest )
pcloud_cp ( args . mean_std_file , cloud_mean_file )
if ret != 0 :
pack_data ( args . train_manifest_path , local_train_tar ,
local_train_manifest )
pcloud_cp ( local_train_manifest , cloud_train_manifest )
pcloud_cp ( local_train_tar , cloud_train_tar )
# Test data: pack and upload only when pcloud_exist returns non-zero for the cloud manifest.
if args . test_manifest_path != " " :
ret = pcloud_exist ( cloud_test_manifest )
if ret != 0 :
pack_data ( args . test_manifest_path , local_test_tar ,
local_test_manifest )
pcloud_cp ( local_test_manifest , cloud_test_manifest )
pcloud_cp ( local_test_tar , cloud_test_tar )
# Vocab file: upload only when pcloud_exist returns non-zero for the cloud copy.
if args . vocab_file != " " :
ret = pcloud_exist ( cloud_vocab_file )
if ret != 0 :
pcloud_cp ( args . vocab_file , cloud_vocab_file )
# mean_std file: upload only when pcloud_exist returns non-zero for the cloud copy.
if args . mean_std_file != " " :
ret = pcloud_exist ( cloud_mean_file )
if ret != 0 :
pcloud_cp ( args . mean_std_file , cloud_mean_file )
# Remove the local temporary directory now that it is no longer needed.
shutil . rmtree ( args . local_tmp_path )
shutil . rmtree ( args . local_tmp_path )