From d2e467385d8367ac072a7d98688466d74661cc4b Mon Sep 17 00:00:00 2001 From: yangyaming Date: Mon, 5 Jun 2017 21:00:15 +0800 Subject: [PATCH 1/4] Add loading model function for train.py. --- train.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/train.py b/train.py index e6a7d076b..14c7cf637 100644 --- a/train.py +++ b/train.py @@ -11,6 +11,7 @@ import sys from model import deep_speech2 from audio_data_utils import DataGenerator import numpy as np +import os #TODO: add WER metric @@ -78,6 +79,11 @@ parser.add_argument( default='data/eng_vocab.txt', type=str, help="Vocabulary filepath. (default: %(default)s)") +parser.add_argument( + "--init_model_path", + default='models/params.tar.gz', + type=str, + help="Model path for initialization. (default: %(default)s)") args = parser.parse_args() @@ -114,8 +120,13 @@ def train(): rnn_size=args.rnn_layer_size, is_inference=False) - # create parameters and optimizer - parameters = paddle.parameters.create(cost) + # create/load parameters and optimizer + if args.init_model_path is None: + parameters = paddle.parameters.create(cost) + else: + assert os.path.isfile(args.init_model_path), "Invalid model." + parameters = paddle.parameters.Parameters.from_tar( + gzip.open(args.init_model_path)) optimizer = paddle.optimizer.Adam( learning_rate=args.adam_learning_rate, gradient_clipping_threshold=400) trainer = paddle.trainer.SGD( From d3eeb7fd76f8b9f86ca01e80f524dde652211428 Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Wed, 7 Jun 2017 17:44:11 +0800 Subject: [PATCH 2/4] Refine librispeech.py for DeepSpeech2. Summary: 1. Add manifest line check. 2. Avoid re-unpacking if unpacked data already exists. 3. Add full_download (download all 7 sub-datasets of LibriSpeech). 
--- README.md | 5 ++- data/librispeech.py | 90 ++++++++++++++++++++++++++++++++++++++------- 2 files changed, 80 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index bb1815c00..403511d58 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ For some machines, we also need to install libsndfile1. Details to be added. ``` cd data python librispeech.py +cat manifest.libri.train-* > manifest.libri.train-all cd .. ``` @@ -32,13 +33,13 @@ python librispeech.py --help For GPU Training: ``` -CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --trainer_count 4 +CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --trainer_count 4 --train_manifest_path ./data/manifest.libri.train-all ``` For CPU Training: ``` -python train.py --trainer_count 8 --use_gpu False +python train.py --trainer_count 8 --use_gpu False --train_manifest_path ./data/manifest.libri.train-all ``` More help for arguments: diff --git a/data/librispeech.py b/data/librispeech.py index 838fee597..8bc33575e 100644 --- a/data/librispeech.py +++ b/data/librispeech.py @@ -1,13 +1,15 @@ """ - Download, unpack and create manifest for Librespeech dataset. + Download, unpack and create manifest file for the Librespeech dataset. - Manifest is a json file with each line containing one audio clip filepath, - its transcription text string, and its duration. It servers as a unified - interfance to organize different data sets. + A manifest file is a dataset summarization, with each line a json format + string containing meta data for one audio clip, including its filepath, + transcription string, and duration. It serves as a unified interface for + different data sets. 
""" import paddle.v2 as paddle from paddle.v2.dataset.common import md5file +import distutils.util import os import wget import tarfile @@ -27,11 +29,21 @@ URL_TRAIN_CLEAN_360 = URL_ROOT + "/train-clean-360.tar.gz" URL_TRAIN_OTHER_500 = URL_ROOT + "/train-other-500.tar.gz" MD5_TEST_CLEAN = "32fa31d27d2e1cad72775fee3f4849a9" +MD5_TEST_OTHER = "fb5a50374b501bb3bac4815ee91d3135" MD5_DEV_CLEAN = "42e2234ba48799c1f50f24a7926300a1" +MD5_DEV_OTHER = "c8d0bcc9cca99d4f8b62fcc847357931" MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522" MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa" MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708" +NUM_LINES_TEST_CLEAN = 2620 +NUM_LINES_TEST_OTHER = 2939 +NUM_LINES_DEV_CLEAN = 2703 +NUM_LINES_DEV_OTHER = 2864 +NUM_LINES_TRAIN_CLEAN_100 = 28539 +NUM_LINES_TRAIN_CLEAN_360 = 104014 +NUM_LINES_TRAIN_OTHER_500 = 148688 + parser = argparse.ArgumentParser( description='Downloads and prepare LibriSpeech dataset.') parser.add_argument( @@ -44,6 +56,13 @@ parser.add_argument( default="manifest.libri", type=str, help="Filepath prefix for output manifests. (default: %(default)s)") +parser.add_argument( + "--full_download", + default="True", + type=distutils.util.strtobool, + help="Download all datasets for Librispeech." + " If False, only download a minimal requirement (test-clean, dev-clean" + " train-clean-100). (default: %(default)s)") args = parser.parse_args() @@ -57,7 +76,10 @@ def download(url, md5sum, target_dir): print("Downloading %s ..." % url) wget.download(url, target_dir) print("\nMD5 Chesksum %s ..." % filepath) - assert md5file(filepath) == md5sum, "MD5 checksum failed." + if not md5file(filepath) == md5sum: + raise RuntimeError("MD5 checksum failed.") + else: + print("File exists, skip downloading. 
(%s)" % filepath) return filepath @@ -69,7 +91,6 @@ def unpack(filepath, target_dir): tar = tarfile.open(filepath) tar.extractall(target_dir) tar.close() - return target_dir def create_manifest(data_dir, manifest_path): @@ -83,7 +104,7 @@ def create_manifest(data_dir, manifest_path): """ print("Creating manifest %s ..." % manifest_path) json_lines = [] - for subfolder, _, filelist in os.walk(data_dir): + for subfolder, _, filelist in sorted(os.walk(data_dir)): text_filelist = [ filename for filename in filelist if filename.endswith('trans.txt') ] @@ -107,13 +128,28 @@ def create_manifest(data_dir, manifest_path): out_file.write(line + '\n') -def prepare_dataset(url, md5sum, target_dir, manifest_path): +def verify_file_line_number(filepath, num_lines): + with open(filepath, 'r') as file: + return len(file.readlines()) == num_lines + + +def prepare_dataset(url, md5sum, target_dir, manifest_path, num_lines): """ Download, unpack and create summmary manifest file. """ + # download filepath = download(url, md5sum, target_dir) - unpacked_dir = unpack(filepath, target_dir) - create_manifest(unpacked_dir, manifest_path) + # unpack + if not os.path.exists(os.path.join(target_dir, "LibriSpeech")): + unpack(filepath, target_dir) + else: + print("Unpacked data exists, skip unpacking.") + # create manifest and verify line number + create_manifest(target_dir, manifest_path) + if not verify_file_line_number(manifest_path, num_lines): + raise RuntimeError("Manifest line number check failed. 
" + "Please remove directory and try running the script " + "again.") def main(): @@ -121,17 +157,45 @@ def main(): url=URL_TEST_CLEAN, md5sum=MD5_TEST_CLEAN, target_dir=os.path.join(args.target_dir, "test-clean"), - manifest_path=args.manifest_prefix + ".test-clean") + manifest_path=args.manifest_prefix + ".test-clean", + num_lines=NUM_LINES_TEST_CLEAN) prepare_dataset( url=URL_DEV_CLEAN, md5sum=MD5_DEV_CLEAN, target_dir=os.path.join(args.target_dir, "dev-clean"), - manifest_path=args.manifest_prefix + ".dev-clean") + manifest_path=args.manifest_prefix + ".dev-clean", + num_lines=NUM_LINES_DEV_CLEAN) prepare_dataset( url=URL_TRAIN_CLEAN_100, md5sum=MD5_TRAIN_CLEAN_100, target_dir=os.path.join(args.target_dir, "train-clean-100"), - manifest_path=args.manifest_prefix + ".train-clean-100") + manifest_path=args.manifest_prefix + ".train-clean-100", + num_lines=NUM_LINES_TRAIN_CLEAN_100) + if args.full_download: + prepare_dataset( + url=URL_TEST_OTHER, + md5sum=MD5_TEST_OTHER, + target_dir=os.path.join(args.target_dir, "test-other"), + manifest_path=args.manifest_prefix + ".test-other", + num_lines=NUM_LINES_TEST_OTHER) + prepare_dataset( + url=URL_DEV_OTHER, + md5sum=MD5_DEV_OTHER, + target_dir=os.path.join(args.target_dir, "dev-other"), + manifest_path=args.manifest_prefix + ".dev-other", + num_lines=NUM_LINES_DEV_OTHER) + prepare_dataset( + url=URL_TRAIN_CLEAN_360, + md5sum=MD5_TRAIN_CLEAN_360, + target_dir=os.path.join(args.target_dir, "train-clean-360"), + manifest_path=args.manifest_prefix + ".train-clean-360", + num_lines=NUM_LINES_TRAIN_CLEAN_360) + prepare_dataset( + url=URL_TRAIN_OTHER_500, + md5sum=MD5_TRAIN_OTHER_500, + target_dir=os.path.join(args.target_dir, "train-other-500"), + manifest_path=args.manifest_prefix + ".train-other-500", + num_lines=NUM_LINES_TRAIN_OTHER_500) if __name__ == '__main__': From f49eab5fec2b478a7822f6459e4a8e7023f65df1 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Wed, 7 Jun 2017 19:11:21 +0800 Subject: [PATCH 3/4] Change 
assert to exception raising. --- train.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/train.py b/train.py index 14c7cf637..89ab23c68 100644 --- a/train.py +++ b/train.py @@ -81,9 +81,11 @@ parser.add_argument( help="Vocabulary filepath. (default: %(default)s)") parser.add_argument( "--init_model_path", - default='models/params.tar.gz', + default=None, type=str, - help="Model path for initialization. (default: %(default)s)") + help="If set None, the training will start from scratch. " + "Otherwise, the training will resume from " + "the existing model of this path. (default: %(default)s)") args = parser.parse_args() @@ -124,7 +126,8 @@ def train(): if args.init_model_path is None: parameters = paddle.parameters.create(cost) else: - assert os.path.isfile(args.init_model_path), "Invalid model." + if not os.path.isfile(args.init_model_path): + raise IOError("Invalid model!") parameters = paddle.parameters.Parameters.from_tar( gzip.open(args.init_model_path)) optimizer = paddle.optimizer.Adam( From 06e9f713899f2118c08753bfe40bd2abf4d152b2 Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Thu, 8 Jun 2017 22:20:11 +0800 Subject: [PATCH 4/4] Remove manifest's line number check from librispeech.py and update README.md. --- README.md | 4 +++ data/librispeech.py | 69 ++++++++++++++------------------------------- 2 files changed, 25 insertions(+), 48 deletions(-) diff --git a/README.md b/README.md index 403511d58..7a372e9be 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,10 @@ cat manifest.libri.train-* > manifest.libri.train-all cd .. ``` +After running librispeech.py, we have several "manifest" json files named with a prefix `manifest.libri.`. A manifest file summarizes a speech data set, with each line containing the meta data (i.e. audio filepath, transcription text, audio duration) of each audio file within the data set, in json format. 
+ +By `cat manifest.libri.train-* > manifest.libri.train-all`, we simply merge the three separate sample sets of LibriSpeech (train-clean-100, train-clean-360, train-other-500) into one training set. This is a simple way for merging different data sets. + More help for arguments: ``` diff --git a/data/librispeech.py b/data/librispeech.py index 8bc33575e..653caa926 100644 --- a/data/librispeech.py +++ b/data/librispeech.py @@ -1,10 +1,9 @@ """ - Download, unpack and create manifest file for the Librespeech dataset. + Download, unpack and create manifest json files for the Librespeech dataset. - A manifest file is a dataset summarization, with each line a json format - string containing meta data for one audio clip, including its filepath, - transcription string, and duration. It serves as a unified interface for - different data sets. + A manifest is a json file summarizing filelist in a data set, with each line + containing the meta data (i.e. audio filepath, transcription text, audio + duration) of each audio file in the data set. """ import paddle.v2 as paddle @@ -36,14 +35,6 @@ MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522" MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa" MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708" -NUM_LINES_TEST_CLEAN = 2620 -NUM_LINES_TEST_OTHER = 2939 -NUM_LINES_DEV_CLEAN = 2703 -NUM_LINES_DEV_OTHER = 2864 -NUM_LINES_TRAIN_CLEAN_100 = 28539 -NUM_LINES_TRAIN_CLEAN_360 = 104014 -NUM_LINES_TRAIN_OTHER_500 = 148688 - parser = argparse.ArgumentParser( description='Downloads and prepare LibriSpeech dataset.') parser.add_argument( @@ -95,12 +86,9 @@ def unpack(filepath, target_dir): def create_manifest(data_dir, manifest_path): """ - Create a manifest file summarizing the dataset (list of filepath and meta - data). - - Each line of the manifest contains one audio clip filepath, its - transcription text string, and its duration. Manifest file servers as a - unified interfance to organize data sets. 
+ Create a manifest json file summarizing the data set, with each line + containing the meta data (i.e. audio filepath, transcription text, audio + duration) of each audio file within the data set. """ print("Creating manifest %s ..." % manifest_path) json_lines = [] @@ -128,28 +116,20 @@ def create_manifest(data_dir, manifest_path): out_file.write(line + '\n') -def verify_file_line_number(filepath, num_lines): - with open(filepath, 'r') as file: - return len(file.readlines()) == num_lines - - -def prepare_dataset(url, md5sum, target_dir, manifest_path, num_lines): +def prepare_dataset(url, md5sum, target_dir, manifest_path): """ Download, unpack and create summmary manifest file. """ - # download - filepath = download(url, md5sum, target_dir) - # unpack if not os.path.exists(os.path.join(target_dir, "LibriSpeech")): + # download + filepath = download(url, md5sum, target_dir) + # unpack unpack(filepath, target_dir) else: - print("Unpacked data exists, skip unpacking.") - # create manifest and verify line number + print("Skip downloading and unpacking. Data already exists in %s." % + target_dir) + # create manifest json file create_manifest(target_dir, manifest_path) - if not verify_file_line_number(manifest_path, num_lines): - raise RuntimeError("Manifest line number check failed. 
" - "Please remove directory and try running the script " - "again.") def main(): @@ -157,45 +137,38 @@ def main(): url=URL_TEST_CLEAN, md5sum=MD5_TEST_CLEAN, target_dir=os.path.join(args.target_dir, "test-clean"), - manifest_path=args.manifest_prefix + ".test-clean", - num_lines=NUM_LINES_TEST_CLEAN) + manifest_path=args.manifest_prefix + ".test-clean") prepare_dataset( url=URL_DEV_CLEAN, md5sum=MD5_DEV_CLEAN, target_dir=os.path.join(args.target_dir, "dev-clean"), - manifest_path=args.manifest_prefix + ".dev-clean", - num_lines=NUM_LINES_DEV_CLEAN) + manifest_path=args.manifest_prefix + ".dev-clean") prepare_dataset( url=URL_TRAIN_CLEAN_100, md5sum=MD5_TRAIN_CLEAN_100, target_dir=os.path.join(args.target_dir, "train-clean-100"), - manifest_path=args.manifest_prefix + ".train-clean-100", - num_lines=NUM_LINES_TRAIN_CLEAN_100) + manifest_path=args.manifest_prefix + ".train-clean-100") if args.full_download: prepare_dataset( url=URL_TEST_OTHER, md5sum=MD5_TEST_OTHER, target_dir=os.path.join(args.target_dir, "test-other"), - manifest_path=args.manifest_prefix + ".test-other", - num_lines=NUM_LINES_TEST_OTHER) + manifest_path=args.manifest_prefix + ".test-other") prepare_dataset( url=URL_DEV_OTHER, md5sum=MD5_DEV_OTHER, target_dir=os.path.join(args.target_dir, "dev-other"), - manifest_path=args.manifest_prefix + ".dev-other", - num_lines=NUM_LINES_DEV_OTHER) + manifest_path=args.manifest_prefix + ".dev-other") prepare_dataset( url=URL_TRAIN_CLEAN_360, md5sum=MD5_TRAIN_CLEAN_360, target_dir=os.path.join(args.target_dir, "train-clean-360"), - manifest_path=args.manifest_prefix + ".train-clean-360", - num_lines=NUM_LINES_TRAIN_CLEAN_360) + manifest_path=args.manifest_prefix + ".train-clean-360") prepare_dataset( url=URL_TRAIN_OTHER_500, md5sum=MD5_TRAIN_OTHER_500, target_dir=os.path.join(args.target_dir, "train-other-500"), - manifest_path=args.manifest_prefix + ".train-other-500", - num_lines=NUM_LINES_TRAIN_OTHER_500) + manifest_path=args.manifest_prefix + 
".train-other-500") if __name__ == '__main__':