Refine librispeech.py for DeepSpeech2.

Summary:
1. Add a manifest line-count check.
2. Avoid re-unpacking if unpacked data already exists.
3. Add full_download (download all 7 sub-datasets of LibriSpeech).
Xinghai Sun 8 years ago
parent 730d5c4dd3
commit d3eeb7fd76

@@ -18,6 +18,7 @@ For some machines, we also need to install libsndfile1. Details to be added.
```
cd data
python librispeech.py
cat manifest.libri.train-* > manifest.libri.train-all
cd ..
```
@@ -32,13 +33,13 @@ python librispeech.py --help
For GPU Training:
```
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --trainer_count 4
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --trainer_count 4 --train_manifest_path ./data/manifest.libri.train-all
```
For CPU Training:
```
python train.py --trainer_count 8 --use_gpu False
python train.py --trainer_count 8 --use_gpu False --train_manifest_path ./data/manifest.libri.train-all
```
More help for arguments:

@@ -1,13 +1,15 @@
"""
Download, unpack and create manifest for Librespeech dataset.
Download, unpack and create a manifest file for the LibriSpeech dataset.
Manifest is a json file with each line containing one audio clip filepath,
its transcription text string, and its duration. It servers as a unified
interfance to organize different data sets.
A manifest file is a dataset summary, with each line a JSON-formatted
string containing the metadata for one audio clip: its filepath,
transcription text, and duration. It serves as a unified interface for
different datasets.
"""
import paddle.v2 as paddle
from paddle.v2.dataset.common import md5file
import distutils.util
import os
import wget
import tarfile
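The module docstring above describes the manifest format. For illustration, one manifest line could look like the sketch below; the key names, path, and values are assumptions for this example and are not taken from the script.
```
import json

# Illustrative only: one manifest line is a JSON object holding the clip's
# filepath, transcription text, and duration in seconds.
example_line = json.dumps({
    "audio_filepath": "/data/LibriSpeech/dev-clean/1272/128104/1272-128104-0000.flac",
    "text": "example transcription text",
    "duration": 5.86,
})
print(example_line)
```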
@@ -27,11 +29,21 @@ URL_TRAIN_CLEAN_360 = URL_ROOT + "/train-clean-360.tar.gz"
URL_TRAIN_OTHER_500 = URL_ROOT + "/train-other-500.tar.gz"
MD5_TEST_CLEAN = "32fa31d27d2e1cad72775fee3f4849a9"
MD5_TEST_OTHER = "fb5a50374b501bb3bac4815ee91d3135"
MD5_DEV_CLEAN = "42e2234ba48799c1f50f24a7926300a1"
MD5_DEV_OTHER = "c8d0bcc9cca99d4f8b62fcc847357931"
MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522"
MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa"
MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708"
NUM_LINES_TEST_CLEAN = 2620
NUM_LINES_TEST_OTHER = 2939
NUM_LINES_DEV_CLEAN = 2703
NUM_LINES_DEV_OTHER = 2864
NUM_LINES_TRAIN_CLEAN_100 = 28539
NUM_LINES_TRAIN_CLEAN_360 = 104014
NUM_LINES_TRAIN_OTHER_500 = 148688
parser = argparse.ArgumentParser(
description='Download and prepare LibriSpeech dataset.')
parser.add_argument(
@@ -44,6 +56,13 @@ parser.add_argument(
default="manifest.libri",
type=str,
help="Filepath prefix for output manifests. (default: %(default)s)")
parser.add_argument(
"--full_download",
default="True",
type=distutils.util.strtobool,
help="Download all datasets for Librispeech."
" If False, only download a minimal requirement (test-clean, dev-clean"
" train-clean-100). (default: %(default)s)")
args = parser.parse_args()
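Because `--full_download` is declared with `type=distutils.util.strtobool`, both the string default `"True"` and any command-line value are converted to `1`/`0` before the later `if args.full_download:` test. A minimal standalone illustration of `strtobool`'s behavior:
```
from distutils.util import strtobool

# strtobool maps common truth-value spellings to 1 or 0 and raises
# ValueError for anything it does not recognize.
print(strtobool("True"))   # 1
print(strtobool("false"))  # 0
print(strtobool("y"))      # 1
print(strtobool("off"))    # 0
```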
@@ -57,7 +76,10 @@ def download(url, md5sum, target_dir):
print("Downloading %s ..." % url)
wget.download(url, target_dir)
print("\nMD5 Chesksum %s ..." % filepath)
assert md5file(filepath) == md5sum, "MD5 checksum failed."
if not md5file(filepath) == md5sum:
raise RuntimeError("MD5 checksum failed.")
else:
print("File exists, skip downloading. (%s)" % filepath)
return filepath
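Replacing the `assert` with an explicit `RuntimeError` presumably keeps the checksum failure effective even under optimized runs, where assertions are stripped. A hypothetical standalone illustration (assumes `python` is on PATH; not part of the commit):
```
import subprocess

# Under -O, assert statements are compiled away, so the first call exits 0;
# an explicit raise is never stripped, so the second call exits 1.
print(subprocess.call(["python", "-O", "-c", "assert False, 'md5 mismatch'"]))
print(subprocess.call(["python", "-O", "-c", "raise RuntimeError('md5 mismatch')"]))
```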
@@ -69,7 +91,6 @@ def unpack(filepath, target_dir):
tar = tarfile.open(filepath)
tar.extractall(target_dir)
tar.close()
return target_dir
def create_manifest(data_dir, manifest_path):
@@ -83,7 +104,7 @@ def create_manifest(data_dir, manifest_path):
"""
print("Creating manifest %s ..." % manifest_path)
json_lines = []
for subfolder, _, filelist in os.walk(data_dir):
for subfolder, _, filelist in sorted(os.walk(data_dir)):
text_filelist = [
filename for filename in filelist if filename.endswith('trans.txt')
]
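The change from `os.walk(data_dir)` to `sorted(os.walk(data_dir))` above presumably makes manifest ordering deterministic, since `os.walk` yields directories in filesystem-dependent order. A small sketch of the effect (hypothetical directory):
```
import os

# Sorting the (dirpath, dirnames, filenames) tuples by dirpath yields the same
# traversal order on every run, so repeated manifest builds produce files with
# identical line order and the later line-count check stays reproducible.
for subfolder, _, filelist in sorted(os.walk("LibriSpeech/dev-clean")):
    print(subfolder, len(filelist))
```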
@@ -107,13 +128,28 @@ def create_manifest(data_dir, manifest_path):
out_file.write(line + '\n')
def prepare_dataset(url, md5sum, target_dir, manifest_path):
def verify_file_line_number(filepath, num_lines):
with open(filepath, 'r') as file:
return len(file.readlines()) == num_lines
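`verify_file_line_number` materializes every line with `readlines()`, which is fine at these manifest sizes; a streaming count is a possible drop-in alternative (a sketch, not part of the commit):
```
def verify_file_line_number_streaming(filepath, num_lines):
    # Count lines lazily instead of loading the whole manifest into memory.
    with open(filepath, 'r') as manifest_file:
        return sum(1 for _ in manifest_file) == num_lines
```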
def prepare_dataset(url, md5sum, target_dir, manifest_path, num_lines):
"""
Download, unpack and create a summary manifest file.
"""
# download
filepath = download(url, md5sum, target_dir)
unpacked_dir = unpack(filepath, target_dir)
create_manifest(unpacked_dir, manifest_path)
# unpack
if not os.path.exists(os.path.join(target_dir, "LibriSpeech")):
unpack(filepath, target_dir)
else:
print("Unpacked data exists, skip unpacking.")
# create manifest and verify line number
create_manifest(target_dir, manifest_path)
if not verify_file_line_number(manifest_path, num_lines):
raise RuntimeError("Manifest line number check failed. "
"Please remove directory and try running the script "
"again.")
def main():
@@ -121,17 +157,45 @@ def main():
url=URL_TEST_CLEAN,
md5sum=MD5_TEST_CLEAN,
target_dir=os.path.join(args.target_dir, "test-clean"),
manifest_path=args.manifest_prefix + ".test-clean")
manifest_path=args.manifest_prefix + ".test-clean",
num_lines=NUM_LINES_TEST_CLEAN)
prepare_dataset(
url=URL_DEV_CLEAN,
md5sum=MD5_DEV_CLEAN,
target_dir=os.path.join(args.target_dir, "dev-clean"),
manifest_path=args.manifest_prefix + ".dev-clean")
manifest_path=args.manifest_prefix + ".dev-clean",
num_lines=NUM_LINES_DEV_CLEAN)
prepare_dataset(
url=URL_TRAIN_CLEAN_100,
md5sum=MD5_TRAIN_CLEAN_100,
target_dir=os.path.join(args.target_dir, "train-clean-100"),
manifest_path=args.manifest_prefix + ".train-clean-100")
manifest_path=args.manifest_prefix + ".train-clean-100",
num_lines=NUM_LINES_TRAIN_CLEAN_100)
if args.full_download:
prepare_dataset(
url=URL_TEST_OTHER,
md5sum=MD5_TEST_OTHER,
target_dir=os.path.join(args.target_dir, "test-other"),
manifest_path=args.manifest_prefix + ".test-other",
num_lines=NUM_LINES_TEST_OTHER)
prepare_dataset(
url=URL_DEV_OTHER,
md5sum=MD5_DEV_OTHER,
target_dir=os.path.join(args.target_dir, "dev-other"),
manifest_path=args.manifest_prefix + ".dev-other",
num_lines=NUM_LINES_DEV_OTHER)
prepare_dataset(
url=URL_TRAIN_CLEAN_360,
md5sum=MD5_TRAIN_CLEAN_360,
target_dir=os.path.join(args.target_dir, "train-clean-360"),
manifest_path=args.manifest_prefix + ".train-clean-360",
num_lines=NUM_LINES_TRAIN_CLEAN_360)
prepare_dataset(
url=URL_TRAIN_OTHER_500,
md5sum=MD5_TRAIN_OTHER_500,
target_dir=os.path.join(args.target_dir, "train-other-500"),
manifest_path=args.manifest_prefix + ".train-other-500",
num_lines=NUM_LINES_TRAIN_OTHER_500)
if __name__ == '__main__':
