@ -1,10 +1,9 @@
"""
"""
Download , unpack and create manifest file for the LibriSpeech dataset .
Download , unpack and create manifest json files for the LibriSpeech dataset .
A manifest file is a dataset summarization , with each line a json format
A manifest is a json file summarizing the file list of a data set , with each line
string containing meta data for one audio clip , including its filepath ,
containing the meta data ( i . e . audio filepath , transcription text , audio
transcription string , and duration . It serves as a unified interface for
duration ) of each audio file in the data set .
different data sets .
"""
"""
import paddle . v2 as paddle
import paddle . v2 as paddle
@ -36,14 +35,6 @@ MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522"
MD5_TRAIN_CLEAN_360 = " c0e676e450a7ff2f54aeade5171606fa "
MD5_TRAIN_CLEAN_360 = " c0e676e450a7ff2f54aeade5171606fa "
MD5_TRAIN_OTHER_500 = " d1a0fd59409feb2c614ce4d30c387708 "
MD5_TRAIN_OTHER_500 = " d1a0fd59409feb2c614ce4d30c387708 "
NUM_LINES_TEST_CLEAN = 2620
NUM_LINES_TEST_OTHER = 2939
NUM_LINES_DEV_CLEAN = 2703
NUM_LINES_DEV_OTHER = 2864
NUM_LINES_TRAIN_CLEAN_100 = 28539
NUM_LINES_TRAIN_CLEAN_360 = 104014
NUM_LINES_TRAIN_OTHER_500 = 148688
parser = argparse . ArgumentParser (
parser = argparse . ArgumentParser (
description = ' Downloads and prepare LibriSpeech dataset. ' )
description = ' Downloads and prepare LibriSpeech dataset. ' )
parser . add_argument (
parser . add_argument (
@ -95,12 +86,9 @@ def unpack(filepath, target_dir):
def create_manifest ( data_dir , manifest_path ) :
def create_manifest ( data_dir , manifest_path ) :
"""
"""
Create a manifest file summarizing the dataset ( list of filepath and meta
Create a manifest json file summarizing the data set , with each line
data ) .
containing the meta data ( i . e . audio filepath , transcription text , audio
duration ) of each audio file within the data set .
Each line of the manifest contains one audio clip filepath , its
transcription text string , and its duration . The manifest file serves as a
unified interface to organize data sets .
"""
"""
print ( " Creating manifest %s ... " % manifest_path )
print ( " Creating manifest %s ... " % manifest_path )
json_lines = [ ]
json_lines = [ ]
@ -128,28 +116,20 @@ def create_manifest(data_dir, manifest_path):
out_file . write ( line + ' \n ' )
out_file . write ( line + ' \n ' )
def verify_file_line_number ( filepath , num_lines ) :
def prepare_dataset ( url , md5sum , target_dir , manifest_path ) :
with open ( filepath , ' r ' ) as file :
return len ( file . readlines ( ) ) == num_lines
def prepare_dataset ( url , md5sum , target_dir , manifest_path , num_lines ) :
"""
"""
Download , unpack and create summary manifest file .
Download , unpack and create summary manifest file .
"""
"""
if not os . path . exists ( os . path . join ( target_dir , " LibriSpeech " ) ) :
# download
# download
filepath = download ( url , md5sum , target_dir )
filepath = download ( url , md5sum , target_dir )
# unpack
# unpack
if not os . path . exists ( os . path . join ( target_dir , " LibriSpeech " ) ) :
unpack ( filepath , target_dir )
unpack ( filepath , target_dir )
else :
else :
print ( " Unpacked data exists, skip unpacking. " )
print ( " Skip downloading and unpacking. Data already exists in %s . " %
# create manifest and verify line number
target_dir )
# create manifest json file
create_manifest ( target_dir , manifest_path )
create_manifest ( target_dir , manifest_path )
if not verify_file_line_number ( manifest_path , num_lines ) :
raise RuntimeError ( " Manifest line number check failed. "
" Please remove directory and try running the script "
" again. " )
def main ( ) :
def main ( ) :
@ -157,45 +137,38 @@ def main():
url = URL_TEST_CLEAN ,
url = URL_TEST_CLEAN ,
md5sum = MD5_TEST_CLEAN ,
md5sum = MD5_TEST_CLEAN ,
target_dir = os . path . join ( args . target_dir , " test-clean " ) ,
target_dir = os . path . join ( args . target_dir , " test-clean " ) ,
manifest_path = args . manifest_prefix + " .test-clean " ,
manifest_path = args . manifest_prefix + " .test-clean " )
num_lines = NUM_LINES_TEST_CLEAN )
prepare_dataset (
prepare_dataset (
url = URL_DEV_CLEAN ,
url = URL_DEV_CLEAN ,
md5sum = MD5_DEV_CLEAN ,
md5sum = MD5_DEV_CLEAN ,
target_dir = os . path . join ( args . target_dir , " dev-clean " ) ,
target_dir = os . path . join ( args . target_dir , " dev-clean " ) ,
manifest_path = args . manifest_prefix + " .dev-clean " ,
manifest_path = args . manifest_prefix + " .dev-clean " )
num_lines = NUM_LINES_DEV_CLEAN )
prepare_dataset (
prepare_dataset (
url = URL_TRAIN_CLEAN_100 ,
url = URL_TRAIN_CLEAN_100 ,
md5sum = MD5_TRAIN_CLEAN_100 ,
md5sum = MD5_TRAIN_CLEAN_100 ,
target_dir = os . path . join ( args . target_dir , " train-clean-100 " ) ,
target_dir = os . path . join ( args . target_dir , " train-clean-100 " ) ,
manifest_path = args . manifest_prefix + " .train-clean-100 " ,
manifest_path = args . manifest_prefix + " .train-clean-100 " )
num_lines = NUM_LINES_TRAIN_CLEAN_100 )
if args . full_download :
if args . full_download :
prepare_dataset (
prepare_dataset (
url = URL_TEST_OTHER ,
url = URL_TEST_OTHER ,
md5sum = MD5_TEST_OTHER ,
md5sum = MD5_TEST_OTHER ,
target_dir = os . path . join ( args . target_dir , " test-other " ) ,
target_dir = os . path . join ( args . target_dir , " test-other " ) ,
manifest_path = args . manifest_prefix + " .test-other " ,
manifest_path = args . manifest_prefix + " .test-other " )
num_lines = NUM_LINES_TEST_OTHER )
prepare_dataset (
prepare_dataset (
url = URL_DEV_OTHER ,
url = URL_DEV_OTHER ,
md5sum = MD5_DEV_OTHER ,
md5sum = MD5_DEV_OTHER ,
target_dir = os . path . join ( args . target_dir , " dev-other " ) ,
target_dir = os . path . join ( args . target_dir , " dev-other " ) ,
manifest_path = args . manifest_prefix + " .dev-other " ,
manifest_path = args . manifest_prefix + " .dev-other " )
num_lines = NUM_LINES_DEV_OTHER )
prepare_dataset (
prepare_dataset (
url = URL_TRAIN_CLEAN_360 ,
url = URL_TRAIN_CLEAN_360 ,
md5sum = MD5_TRAIN_CLEAN_360 ,
md5sum = MD5_TRAIN_CLEAN_360 ,
target_dir = os . path . join ( args . target_dir , " train-clean-360 " ) ,
target_dir = os . path . join ( args . target_dir , " train-clean-360 " ) ,
manifest_path = args . manifest_prefix + " .train-clean-360 " ,
manifest_path = args . manifest_prefix + " .train-clean-360 " )
num_lines = NUM_LINES_TRAIN_CLEAN_360 )
prepare_dataset (
prepare_dataset (
url = URL_TRAIN_OTHER_500 ,
url = URL_TRAIN_OTHER_500 ,
md5sum = MD5_TRAIN_OTHER_500 ,
md5sum = MD5_TRAIN_OTHER_500 ,
target_dir = os . path . join ( args . target_dir , " train-other-500 " ) ,
target_dir = os . path . join ( args . target_dir , " train-other-500 " ) ,
manifest_path = args . manifest_prefix + " .train-other-500 " ,
manifest_path = args . manifest_prefix + " .train-other-500 " )
num_lines = NUM_LINES_TRAIN_OTHER_500 )
if __name__ == ' __main__ ' :
if __name__ == ' __main__ ' :