PaddleSpeech/librispeech.py

"""
   Download, unpack and create manifests for the LibriSpeech dataset.

   A manifest is a JSON-lines file in which each line contains one audio
   clip filepath, its transcription text string, and its duration. It serves
   as a unified interface to organize different data sets.
"""

import paddle.v2 as paddle
import os
import wget
import tarfile
import argparse
import soundfile
import json

# Root of the per-user cache directory under which the dataset is stored
# by default.
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

# Archive URLs of the three subsets handled by this script.
URL_TEST = "http://www.openslr.org/resources/12/test-clean.tar.gz"
URL_DEV = "http://www.openslr.org/resources/12/dev-clean.tar.gz"
URL_TRAIN = "http://www.openslr.org/resources/12/train-clean-100.tar.gz"

# Command-line interface. The arguments are parsed as soon as this module is
# executed and are exposed through the module-level `args` object.
parser = argparse.ArgumentParser(
    description='Downloads and prepare LibriSpeech dataset.')
parser.add_argument(
    "--target_dir",
    type=str,
    default=DATA_HOME + "/Libri",
    help="Directory to save the dataset.")
parser.add_argument(
    "--manifest",
    type=str,
    default="./libri.manifest",
    help="Filepath prefix for output manifests.")
args = parser.parse_args()


def download(url, target_dir):
    """Fetch `url` into `target_dir` unless the file is already present.

    The local file name is the last "/"-separated component of `url`.
    `target_dir` is created first when it does not exist.

    :param url: Address of the file to download.
    :param target_dir: Directory that receives the downloaded file.
    :return: Path of the file inside `target_dir`.
    """
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    filename = url.rsplit("/", 1)[-1]
    local_path = os.path.join(target_dir, filename)
    if os.path.exists(local_path):
        # Already on disk: skip the network transfer.
        return local_path
    print("Downloading %s ..." % url)
    wget.download(url, target_dir)
    # wget's progress bar does not end its line; terminate it here.
    print("")
    return local_path


def unpack(filepath, target_dir):
    """Extract the tar archive at `filepath` into `target_dir`.

    :param filepath: Path of the tar archive to extract.
    :param target_dir: Directory into which all members are extracted.
    :return: `target_dir`, unchanged.
    """
    print("Unpacking %s ..." % filepath)
    # NOTE(review): extractall() trusts the member paths stored in the
    # archive; only use this on archives from a trusted source.
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(filepath) as tar:
        tar.extractall(target_dir)
    return target_dir


def create_manifest(data_dir, manifest_path):
    """Write a manifest for every transcribed utterance under `data_dir`.

    The directory tree is walked recursively. In each directory that holds a
    file whose name ends with 'trans.txt', the first such file is read: every
    non-blank line is "<utterance-id> <word> <word> ...", and the audio clip
    is expected next to it as "<utterance-id>.flac".

    Each manifest line is a JSON object with the keys 'audio_filepath',
    'duration' (number of samples divided by the sample rate) and 'text'
    (the lower-cased transcription).

    :param data_dir: Root directory to search for transcripts and audio.
    :param manifest_path: Path of the manifest file to write (overwritten).
    """
    print("Creating manifest %s ..." % manifest_path)
    json_lines = []
    for subfolder, _, filelist in os.walk(data_dir):
        text_filelist = [
            filename for filename in filelist if filename.endswith('trans.txt')
        ]
        if not text_filelist:
            continue
        # os.walk() already yields `subfolder` prefixed with `data_dir`;
        # joining `data_dir` in again would duplicate the prefix whenever
        # `data_dir` is a relative path.
        text_filepath = os.path.join(subfolder, text_filelist[0])
        with open(text_filepath) as text_file:
            for line in text_file:
                segments = line.strip().split()
                if not segments:
                    # Blank line: there is no utterance id to look up.
                    continue
                text = ' '.join(segments[1:]).lower()
                audio_filepath = os.path.join(subfolder,
                                              segments[0] + '.flac')
                audio_data, samplerate = soundfile.read(audio_filepath)
                duration = float(len(audio_data)) / samplerate
                json_lines.append(
                    json.dumps({
                        'audio_filepath': audio_filepath,
                        'duration': duration,
                        'text': text
                    }))
    # The manifest is written only after every clip was read successfully,
    # so a failure above does not leave a partial manifest behind.
    with open(manifest_path, 'w') as out_file:
        for line in json_lines:
            out_file.write(line + '\n')


def prepare_dataset(url, target_dir, manifest_path):
    """Download one archive, extract it, and write its manifest.

    :param url: Address of the archive to download.
    :param target_dir: Directory used both for the downloaded archive and
                       for its extracted contents.
    :param manifest_path: Path of the manifest file to create.
    """
    archive_path = download(url, target_dir)
    extracted_root = unpack(archive_path, target_dir)
    create_manifest(extracted_root, manifest_path)


def main():
    """Prepare the test, dev and train subsets, writing one manifest each.

    Every subset is downloaded and extracted into its own subdirectory of
    `args.target_dir`. A manifest is built by walking the subset's whole
    target directory, so sharing a single directory would make the manifests
    written later also list the audio of the subsets extracted before them
    (for example, test clips ending up in the train manifest).
    """
    # (archive URL, subdirectory of target_dir, manifest filename suffix)
    subsets = [
        (URL_TEST, "test-clean", ".test"),
        (URL_DEV, "dev-clean", ".dev"),
        (URL_TRAIN, "train-clean-100", ".train"),
    ]
    for url, subdir, suffix in subsets:
        prepare_dataset(
            url=url,
            target_dir=os.path.join(args.target_dir, subdir),
            manifest_path=args.manifest + suffix)


if __name__ == '__main__':
    main()