Remove manifest's line number check from librispeech.py and update README.md.

9 years ago · 06e9f71389
parent d3eeb7fd76
commit 06e9f71389
2 changed files with 25 additions and 48 deletions
--- a/README.md
+++ b/README.md
@ -22,6 +22,10 @@ cat manifest.libri.train-* > manifest.libri.train-all
 cd ..
 ```

+After running librispeech.py, we have several "manifest" json files named with a prefix `manifest.libri.`. A manifest file summarizes a speech data set, with each line containing the meta data (i.e. audio filepath, transcription text, audio duration) of each audio file within the data set, in json format.
+
+By `cat manifest.libri.train-* > manifest.libri.train-all`, we simply merge the three separate sample sets of LibriSpeech (train-clean-100, train-clean-360, train-other-500) into one training set. This is a simple way to merge different data sets.
+
 More help for arguments:

 ```
--- a/data/librispeech.py
+++ b/data/librispeech.py
@ -1,10 +1,9 @@
 """
-    Download, unpack and create manifest file for the Librespeech dataset.
+    Download, unpack and create manifest json files for the LibriSpeech dataset.

-    A manifest file is a dataset summarization, with each line a json format
-    string containing meta data for one audio clip, including its filepath,
-    transcription string, and duration. It serves as a unified interface for
-    different data sets.
+    A manifest is a json file summarizing the file list of a data set, with each
+    line containing the meta data (i.e. audio filepath, transcription text,
+    audio duration) of one audio file in the data set.
 """

 import paddle.v2 as paddle
@ -36,14 +35,6 @@ MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522"
 MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa"
 MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708"

-NUM_LINES_TEST_CLEAN = 2620
-NUM_LINES_TEST_OTHER = 2939
-NUM_LINES_DEV_CLEAN = 2703
-NUM_LINES_DEV_OTHER = 2864
-NUM_LINES_TRAIN_CLEAN_100 = 28539
-NUM_LINES_TRAIN_CLEAN_360 = 104014
-NUM_LINES_TRAIN_OTHER_500 = 148688
-
 parser = argparse.ArgumentParser(
    description='Downloads and prepare LibriSpeech dataset.')
 parser.add_argument(
@ -95,12 +86,9 @@ def unpack(filepath, target_dir):

 def create_manifest(data_dir, manifest_path):
    """
-    Create a manifest file summarizing the dataset (list of filepath and meta
-    data).
-
-    Each line of the manifest contains one audio clip filepath, its
-    transcription text string, and its duration. Manifest file servers as a
-    unified interfance to organize data sets.
+    Create a manifest json file summarizing the data set, with each line
+    containing the meta data (i.e. audio filepath, transcription text, audio
+    duration) of each audio file within the data set.
    """
    print("Creating manifest %s ..." % manifest_path)
    json_lines = []
@ -128,28 +116,20 @@ def create_manifest(data_dir, manifest_path):
            out_file.write(line + '\n')


-def verify_file_line_number(filepath, num_lines):
-    with open(filepath, 'r') as file:
-        return len(file.readlines()) == num_lines
-
-
-def prepare_dataset(url, md5sum, target_dir, manifest_path, num_lines):
+def prepare_dataset(url, md5sum, target_dir, manifest_path):
    """
    Download, unpack and create summmary manifest file.
    """
-    # download
-    filepath = download(url, md5sum, target_dir)
-    # unpack
    if not os.path.exists(os.path.join(target_dir, "LibriSpeech")):
+        # download
+        filepath = download(url, md5sum, target_dir)
+        # unpack
        unpack(filepath, target_dir)
    else:
-        print("Unpacked data exists, skip unpacking.")
-    # create manifest and verify line number
+        print("Skip downloading and unpacking. Data already exists in %s." %
+              target_dir)
+    # create manifest json file
    create_manifest(target_dir, manifest_path)
-    if not verify_file_line_number(manifest_path, num_lines):
-        raise RuntimeError("Manifest line number check failed. "
-                           "Please remove directory and try running the script "
-                           "again.")


 def main():
@ -157,45 +137,38 @@ def main():
        url=URL_TEST_CLEAN,
        md5sum=MD5_TEST_CLEAN,
        target_dir=os.path.join(args.target_dir, "test-clean"),
-        manifest_path=args.manifest_prefix + ".test-clean",
-        num_lines=NUM_LINES_TEST_CLEAN)
+        manifest_path=args.manifest_prefix + ".test-clean")
    prepare_dataset(
        url=URL_DEV_CLEAN,
        md5sum=MD5_DEV_CLEAN,
        target_dir=os.path.join(args.target_dir, "dev-clean"),
-        manifest_path=args.manifest_prefix + ".dev-clean",
-        num_lines=NUM_LINES_DEV_CLEAN)
+        manifest_path=args.manifest_prefix + ".dev-clean")
    prepare_dataset(
        url=URL_TRAIN_CLEAN_100,
        md5sum=MD5_TRAIN_CLEAN_100,
        target_dir=os.path.join(args.target_dir, "train-clean-100"),
-        manifest_path=args.manifest_prefix + ".train-clean-100",
-        num_lines=NUM_LINES_TRAIN_CLEAN_100)
+        manifest_path=args.manifest_prefix + ".train-clean-100")
    if args.full_download:
        prepare_dataset(
            url=URL_TEST_OTHER,
            md5sum=MD5_TEST_OTHER,
            target_dir=os.path.join(args.target_dir, "test-other"),
-            manifest_path=args.manifest_prefix + ".test-other",
-            num_lines=NUM_LINES_TEST_OTHER)
+            manifest_path=args.manifest_prefix + ".test-other")
        prepare_dataset(
            url=URL_DEV_OTHER,
            md5sum=MD5_DEV_OTHER,
            target_dir=os.path.join(args.target_dir, "dev-other"),
-            manifest_path=args.manifest_prefix + ".dev-other",
-            num_lines=NUM_LINES_DEV_OTHER)
+            manifest_path=args.manifest_prefix + ".dev-other")
        prepare_dataset(
            url=URL_TRAIN_CLEAN_360,
            md5sum=MD5_TRAIN_CLEAN_360,
            target_dir=os.path.join(args.target_dir, "train-clean-360"),
-            manifest_path=args.manifest_prefix + ".train-clean-360",
-            num_lines=NUM_LINES_TRAIN_CLEAN_360)
+            manifest_path=args.manifest_prefix + ".train-clean-360")
        prepare_dataset(
            url=URL_TRAIN_OTHER_500,
            md5sum=MD5_TRAIN_OTHER_500,
            target_dir=os.path.join(args.target_dir, "train-other-500"),
-            manifest_path=args.manifest_prefix + ".train-other-500",
-            num_lines=NUM_LINES_TRAIN_OTHER_500)
+            manifest_path=args.manifest_prefix + ".train-other-500")


 if __name__ == '__main__':