@ -1,13 +1,15 @@
"""
Download , unpack and create manifest for Librespeech dataset .
Download , unpack and create manifest file for the Librespeech dataset .
Manifest is a json file with each line containing one audio clip filepath ,
its transcription text string , and its duration . It servers as a unified
interfance to organize different data sets .
A manifest file is a dataset summarization , with each line a json format
string containing meta data for one audio clip , including its filepath ,
transcription string , and duration . It serves as a unified interface for
different data sets .
"""
import paddle . v2 as paddle
from paddle . v2 . dataset . common import md5file
import distutils . util
import os
import wget
import tarfile
@ -27,11 +29,21 @@ URL_TRAIN_CLEAN_360 = URL_ROOT + "/train-clean-360.tar.gz"
URL_TRAIN_OTHER_500 = URL_ROOT + " /train-other-500.tar.gz "
MD5_TEST_CLEAN = " 32fa31d27d2e1cad72775fee3f4849a9 "
MD5_TEST_OTHER = " fb5a50374b501bb3bac4815ee91d3135 "
MD5_DEV_CLEAN = " 42e2234ba48799c1f50f24a7926300a1 "
MD5_DEV_OTHER = " c8d0bcc9cca99d4f8b62fcc847357931 "
MD5_TRAIN_CLEAN_100 = " 2a93770f6d5c6c964bc36631d331a522 "
MD5_TRAIN_CLEAN_360 = " c0e676e450a7ff2f54aeade5171606fa "
MD5_TRAIN_OTHER_500 = " d1a0fd59409feb2c614ce4d30c387708 "
NUM_LINES_TEST_CLEAN = 2620
NUM_LINES_TEST_OTHER = 2939
NUM_LINES_DEV_CLEAN = 2703
NUM_LINES_DEV_OTHER = 2864
NUM_LINES_TRAIN_CLEAN_100 = 28539
NUM_LINES_TRAIN_CLEAN_360 = 104014
NUM_LINES_TRAIN_OTHER_500 = 148688
parser = argparse . ArgumentParser (
description = ' Downloads and prepare LibriSpeech dataset. ' )
parser . add_argument (
@ -44,6 +56,13 @@ parser.add_argument(
default = " manifest.libri " ,
type = str ,
help = " Filepath prefix for output manifests. (default: %(default)s ) " )
parser . add_argument (
" --full_download " ,
default = " True " ,
type = distutils . util . strtobool ,
help = " Download all datasets for Librispeech. "
" If False, only download a minimal requirement (test-clean, dev-clean "
" train-clean-100). (default: %(default)s ) " )
args = parser . parse_args ( )
@ -57,7 +76,10 @@ def download(url, md5sum, target_dir):
print ( " Downloading %s ... " % url )
wget . download ( url , target_dir )
print ( " \n MD5 Chesksum %s ... " % filepath )
assert md5file ( filepath ) == md5sum , " MD5 checksum failed. "
if not md5file ( filepath ) == md5sum :
raise RuntimeError ( " MD5 checksum failed. " )
else :
print ( " File exists, skip downloading. ( %s ) " % filepath )
return filepath
@ -69,7 +91,6 @@ def unpack(filepath, target_dir):
tar = tarfile . open ( filepath )
tar . extractall ( target_dir )
tar . close ( )
return target_dir
def create_manifest ( data_dir , manifest_path ) :
@ -83,7 +104,7 @@ def create_manifest(data_dir, manifest_path):
"""
print ( " Creating manifest %s ... " % manifest_path )
json_lines = [ ]
for subfolder , _ , filelist in os . walk ( data_dir ) :
for subfolder , _ , filelist in sorted ( os . walk ( data_dir ) ) :
text_filelist = [
filename for filename in filelist if filename . endswith ( ' trans.txt ' )
]
@ -107,13 +128,28 @@ def create_manifest(data_dir, manifest_path):
out_file . write ( line + ' \n ' )
def prepare_dataset ( url , md5sum , target_dir , manifest_path ) :
def verify_file_line_number ( filepath , num_lines ) :
with open ( filepath , ' r ' ) as file :
return len ( file . readlines ( ) ) == num_lines
def prepare_dataset ( url , md5sum , target_dir , manifest_path , num_lines ) :
"""
Download , unpack and create summmary manifest file .
"""
# download
filepath = download ( url , md5sum , target_dir )
unpacked_dir = unpack ( filepath , target_dir )
create_manifest ( unpacked_dir , manifest_path )
# unpack
if not os . path . exists ( os . path . join ( target_dir , " LibriSpeech " ) ) :
unpack ( filepath , target_dir )
else :
print ( " Unpacked data exists, skip unpacking. " )
# create manifest and verify line number
create_manifest ( target_dir , manifest_path )
if not verify_file_line_number ( manifest_path , num_lines ) :
raise RuntimeError ( " Manifest line number check failed. "
" Please remove directory and try running the script "
" again. " )
def main ( ) :
@ -121,17 +157,45 @@ def main():
url = URL_TEST_CLEAN ,
md5sum = MD5_TEST_CLEAN ,
target_dir = os . path . join ( args . target_dir , " test-clean " ) ,
manifest_path = args . manifest_prefix + " .test-clean " )
manifest_path = args . manifest_prefix + " .test-clean " ,
num_lines = NUM_LINES_TEST_CLEAN )
prepare_dataset (
url = URL_DEV_CLEAN ,
md5sum = MD5_DEV_CLEAN ,
target_dir = os . path . join ( args . target_dir , " dev-clean " ) ,
manifest_path = args . manifest_prefix + " .dev-clean " )
manifest_path = args . manifest_prefix + " .dev-clean " ,
num_lines = NUM_LINES_DEV_CLEAN )
prepare_dataset (
url = URL_TRAIN_CLEAN_100 ,
md5sum = MD5_TRAIN_CLEAN_100 ,
target_dir = os . path . join ( args . target_dir , " train-clean-100 " ) ,
manifest_path = args . manifest_prefix + " .train-clean-100 " )
manifest_path = args . manifest_prefix + " .train-clean-100 " ,
num_lines = NUM_LINES_TRAIN_CLEAN_100 )
if args . full_download :
prepare_dataset (
url = URL_TEST_OTHER ,
md5sum = MD5_TEST_OTHER ,
target_dir = os . path . join ( args . target_dir , " test-other " ) ,
manifest_path = args . manifest_prefix + " .test-other " ,
num_lines = NUM_LINES_TEST_OTHER )
prepare_dataset (
url = URL_DEV_OTHER ,
md5sum = MD5_DEV_OTHER ,
target_dir = os . path . join ( args . target_dir , " dev-other " ) ,
manifest_path = args . manifest_prefix + " .dev-other " ,
num_lines = NUM_LINES_DEV_OTHER )
prepare_dataset (
url = URL_TRAIN_CLEAN_360 ,
md5sum = MD5_TRAIN_CLEAN_360 ,
target_dir = os . path . join ( args . target_dir , " train-clean-360 " ) ,
manifest_path = args . manifest_prefix + " .train-clean-360 " ,
num_lines = NUM_LINES_TRAIN_CLEAN_360 )
prepare_dataset (
url = URL_TRAIN_OTHER_500 ,
md5sum = MD5_TRAIN_OTHER_500 ,
target_dir = os . path . join ( args . target_dir , " train-other-500 " ) ,
manifest_path = args . manifest_prefix + " .train-other-500 " ,
num_lines = NUM_LINES_TRAIN_OTHER_500 )
if __name__ == ' __main__ ' :