Refine librispeech.py for DeepSpeech2.

Summary:
1. Add manifest line check.
2. Avoid re-unpacking if unpacked data already exists.
3. Add full_download (download all 7 sub-datasets of LibriSpeech).
pull/2/head
Xinghai Sun 7 years ago
parent 730d5c4dd3
commit d3eeb7fd76

@ -18,6 +18,7 @@ For some machines, we also need to install libsndfile1. Details to be added.
``` ```
cd data cd data
python librispeech.py python librispeech.py
cat manifest.libri.train-* > manifest.libri.train-all
cd .. cd ..
``` ```
@ -32,13 +33,13 @@ python librispeech.py --help
For GPU Training: For GPU Training:
``` ```
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --trainer_count 4 CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --trainer_count 4 --train_manifest_path ./data/manifest.libri.train-all
``` ```
For CPU Training: For CPU Training:
``` ```
python train.py --trainer_count 8 --use_gpu False python train.py --trainer_count 8 --use_gpu False --train_manifest_path ./data/manifest.libri.train-all
``` ```
More help for arguments: More help for arguments:

@ -1,13 +1,15 @@
""" """
Download, unpack and create manifest for LibriSpeech dataset. Download, unpack and create manifest file for the LibriSpeech dataset.
Manifest is a json file with each line containing one audio clip filepath, A manifest file is a dataset summarization, with each line a json format
its transcription text string, and its duration. It servers as a unified string containing meta data for one audio clip, including its filepath,
interfance to organize different data sets. transcription string, and duration. It serves as a unified interface for
different data sets.
""" """
import paddle.v2 as paddle import paddle.v2 as paddle
from paddle.v2.dataset.common import md5file from paddle.v2.dataset.common import md5file
import distutils.util
import os import os
import wget import wget
import tarfile import tarfile
@ -27,11 +29,21 @@ URL_TRAIN_CLEAN_360 = URL_ROOT + "/train-clean-360.tar.gz"
URL_TRAIN_OTHER_500 = URL_ROOT + "/train-other-500.tar.gz" URL_TRAIN_OTHER_500 = URL_ROOT + "/train-other-500.tar.gz"
MD5_TEST_CLEAN = "32fa31d27d2e1cad72775fee3f4849a9" MD5_TEST_CLEAN = "32fa31d27d2e1cad72775fee3f4849a9"
MD5_TEST_OTHER = "fb5a50374b501bb3bac4815ee91d3135"
MD5_DEV_CLEAN = "42e2234ba48799c1f50f24a7926300a1" MD5_DEV_CLEAN = "42e2234ba48799c1f50f24a7926300a1"
MD5_DEV_OTHER = "c8d0bcc9cca99d4f8b62fcc847357931"
MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522" MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522"
MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa" MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa"
MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708" MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708"
NUM_LINES_TEST_CLEAN = 2620
NUM_LINES_TEST_OTHER = 2939
NUM_LINES_DEV_CLEAN = 2703
NUM_LINES_DEV_OTHER = 2864
NUM_LINES_TRAIN_CLEAN_100 = 28539
NUM_LINES_TRAIN_CLEAN_360 = 104014
NUM_LINES_TRAIN_OTHER_500 = 148688
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description='Downloads and prepare LibriSpeech dataset.') description='Downloads and prepare LibriSpeech dataset.')
parser.add_argument( parser.add_argument(
@ -44,6 +56,13 @@ parser.add_argument(
default="manifest.libri", default="manifest.libri",
type=str, type=str,
help="Filepath prefix for output manifests. (default: %(default)s)") help="Filepath prefix for output manifests. (default: %(default)s)")
parser.add_argument(
"--full_download",
default="True",
type=distutils.util.strtobool,
help="Download all datasets for Librispeech."
" If False, only download a minimal requirement (test-clean, dev-clean"
" train-clean-100). (default: %(default)s)")
args = parser.parse_args() args = parser.parse_args()
@ -57,7 +76,10 @@ def download(url, md5sum, target_dir):
print("Downloading %s ..." % url) print("Downloading %s ..." % url)
wget.download(url, target_dir) wget.download(url, target_dir)
print("\nMD5 Chesksum %s ..." % filepath) print("\nMD5 Chesksum %s ..." % filepath)
assert md5file(filepath) == md5sum, "MD5 checksum failed." if not md5file(filepath) == md5sum:
raise RuntimeError("MD5 checksum failed.")
else:
print("File exists, skip downloading. (%s)" % filepath)
return filepath return filepath
@ -69,7 +91,6 @@ def unpack(filepath, target_dir):
tar = tarfile.open(filepath) tar = tarfile.open(filepath)
tar.extractall(target_dir) tar.extractall(target_dir)
tar.close() tar.close()
return target_dir
def create_manifest(data_dir, manifest_path): def create_manifest(data_dir, manifest_path):
@ -83,7 +104,7 @@ def create_manifest(data_dir, manifest_path):
""" """
print("Creating manifest %s ..." % manifest_path) print("Creating manifest %s ..." % manifest_path)
json_lines = [] json_lines = []
for subfolder, _, filelist in os.walk(data_dir): for subfolder, _, filelist in sorted(os.walk(data_dir)):
text_filelist = [ text_filelist = [
filename for filename in filelist if filename.endswith('trans.txt') filename for filename in filelist if filename.endswith('trans.txt')
] ]
@ -107,13 +128,28 @@ def create_manifest(data_dir, manifest_path):
out_file.write(line + '\n') out_file.write(line + '\n')
def verify_file_line_number(filepath, num_lines):
    """Check that a text file contains exactly the expected number of lines.

    Used as a sanity check that a generated manifest is complete.

    :param filepath: Path of the text file to check.
    :param num_lines: Expected line count.
    :return: True if the file has exactly num_lines lines, False otherwise.
    """
    with open(filepath, 'r') as file:
        # Count lazily instead of readlines() to avoid loading the whole
        # file into memory (train manifests run to ~150k lines).
        return sum(1 for _ in file) == num_lines
def prepare_dataset(url, md5sum, target_dir, manifest_path, num_lines):
    """Download, unpack and create a summary manifest file for one
    LibriSpeech sub-dataset.

    :param url: URL of the dataset tarball.
    :param md5sum: Expected MD5 checksum of the downloaded tarball.
    :param target_dir: Directory holding the tarball and unpacked data.
    :param manifest_path: Output filepath for the generated manifest.
    :param num_lines: Expected number of manifest lines; used to verify
                      that unpacking and manifest creation completed fully.
    :raises RuntimeError: If the manifest line-number check fails.
    """
    # download (download() itself skips files that already exist)
    filepath = download(url, md5sum, target_dir)
    # unpack — skip if the unpacked data already exists
    if not os.path.exists(os.path.join(target_dir, "LibriSpeech")):
        unpack(filepath, target_dir)
    else:
        print("Unpacked data exists, skip unpacking.")
    # create manifest and verify line number
    create_manifest(target_dir, manifest_path)
    if not verify_file_line_number(manifest_path, num_lines):
        raise RuntimeError("Manifest line number check failed. "
                           "Please remove directory and try running the script "
                           "again.")
def main(): def main():
@ -121,17 +157,45 @@ def main():
url=URL_TEST_CLEAN, url=URL_TEST_CLEAN,
md5sum=MD5_TEST_CLEAN, md5sum=MD5_TEST_CLEAN,
target_dir=os.path.join(args.target_dir, "test-clean"), target_dir=os.path.join(args.target_dir, "test-clean"),
manifest_path=args.manifest_prefix + ".test-clean") manifest_path=args.manifest_prefix + ".test-clean",
num_lines=NUM_LINES_TEST_CLEAN)
prepare_dataset( prepare_dataset(
url=URL_DEV_CLEAN, url=URL_DEV_CLEAN,
md5sum=MD5_DEV_CLEAN, md5sum=MD5_DEV_CLEAN,
target_dir=os.path.join(args.target_dir, "dev-clean"), target_dir=os.path.join(args.target_dir, "dev-clean"),
manifest_path=args.manifest_prefix + ".dev-clean") manifest_path=args.manifest_prefix + ".dev-clean",
num_lines=NUM_LINES_DEV_CLEAN)
prepare_dataset( prepare_dataset(
url=URL_TRAIN_CLEAN_100, url=URL_TRAIN_CLEAN_100,
md5sum=MD5_TRAIN_CLEAN_100, md5sum=MD5_TRAIN_CLEAN_100,
target_dir=os.path.join(args.target_dir, "train-clean-100"), target_dir=os.path.join(args.target_dir, "train-clean-100"),
manifest_path=args.manifest_prefix + ".train-clean-100") manifest_path=args.manifest_prefix + ".train-clean-100",
num_lines=NUM_LINES_TRAIN_CLEAN_100)
if args.full_download:
prepare_dataset(
url=URL_TEST_OTHER,
md5sum=MD5_TEST_OTHER,
target_dir=os.path.join(args.target_dir, "test-other"),
manifest_path=args.manifest_prefix + ".test-other",
num_lines=NUM_LINES_TEST_OTHER)
prepare_dataset(
url=URL_DEV_OTHER,
md5sum=MD5_DEV_OTHER,
target_dir=os.path.join(args.target_dir, "dev-other"),
manifest_path=args.manifest_prefix + ".dev-other",
num_lines=NUM_LINES_DEV_OTHER)
prepare_dataset(
url=URL_TRAIN_CLEAN_360,
md5sum=MD5_TRAIN_CLEAN_360,
target_dir=os.path.join(args.target_dir, "train-clean-360"),
manifest_path=args.manifest_prefix + ".train-clean-360",
num_lines=NUM_LINES_TRAIN_CLEAN_360)
prepare_dataset(
url=URL_TRAIN_OTHER_500,
md5sum=MD5_TRAIN_OTHER_500,
target_dir=os.path.join(args.target_dir, "train-other-500"),
manifest_path=args.manifest_prefix + ".train-other-500",
num_lines=NUM_LINES_TRAIN_OTHER_500)
if __name__ == '__main__': if __name__ == '__main__':

Loading…
Cancel
Save