# Source file: PaddleSpeech/librispeech.py (107 lines, 3.3 KiB)

"""
Download, unpack and create manifests for the LibriSpeech dataset.
A manifest is a JSON-lines file in which each line contains one audio clip
filepath, its transcription text string, and its duration. It serves as a
unified interface to organize different data sets.
"""
import paddle.v2 as paddle
import os
import wget
import tarfile
import argparse
import soundfile
import json
# Root cache directory (under the user's home) for speech datasets.
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

# LibriSpeech archives on OpenSLR (resource 12): test, dev and train splits.
URL_TEST = "http://www.openslr.org/resources/12/test-clean.tar.gz"
URL_DEV = "http://www.openslr.org/resources/12/dev-clean.tar.gz"
URL_TRAIN = "http://www.openslr.org/resources/12/train-clean-100.tar.gz"

# Command-line interface. NOTE: the arguments are parsed right here, at
# module level, and the resulting namespace is kept in the global ``args``.
parser = argparse.ArgumentParser(
    description='Downloads and prepare LibriSpeech dataset.')
parser.add_argument(
    "--target_dir",
    type=str,
    default=DATA_HOME + "/Libri",
    help="Directory to save the dataset.")
parser.add_argument(
    "--manifest",
    type=str,
    default="./libri.manifest",
    help="Filepath prefix for output manifests.")
args = parser.parse_args()
def download(url, target_dir):
    """Fetch `url` into `target_dir` unless the file is already there.

    The local filename is the last ``/``-separated component of `url`.
    `target_dir` is created first when it does not exist yet.

    :param url: Address of the file to download.
    :param target_dir: Directory that receives the downloaded file.
    :return: ``target_dir`` joined with the filename taken from `url`.
    """
    target_missing = not os.path.exists(target_dir)
    if target_missing:
        os.makedirs(target_dir)

    local_path = os.path.join(target_dir, url.rsplit("/", 1)[-1])
    if os.path.exists(local_path):
        # Already present (e.g. from an earlier run): nothing to fetch.
        return local_path

    print("Downloading %s ..." % url)
    wget.download(url, target_dir)
    print("")
    return local_path
def unpack(filepath, target_dir):
    """Extract the tar archive at `filepath` into `target_dir`.

    :param filepath: Path of the tar archive to extract.
    :param target_dir: Directory the archive members are extracted into.
    :return: `target_dir`, unchanged.
    :raises tarfile.TarError: If the archive cannot be read.
    """
    print("Unpacking %s ..." % filepath)
    # The context manager closes the archive even when extraction fails
    # part-way, so the underlying file handle is never leaked.
    # NOTE(review): extractall() trusts the member paths stored in the
    # archive; only use this on archives from a trusted source.
    with tarfile.open(filepath) as tar:
        tar.extractall(target_dir)
    return target_dir
def create_manifest(data_dir, manifest_path):
    """Write a manifest describing every transcribed clip under `data_dir`.

    Every directory below `data_dir` that holds a file whose name ends with
    ``trans.txt`` is processed (only the first such file per directory).
    Each non-blank transcript line is split on whitespace: the first token is
    the utterance id and the remaining tokens are the transcription. The
    audio clip is looked up next to the transcript as ``<utterance id>.flac``.

    One JSON object per clip is written to `manifest_path`, one per line,
    with the keys ``audio_filepath``, ``duration`` (number of audio frames
    divided by the sample rate) and ``text`` (lower-cased transcription).
    The manifest is only written after all clips were read successfully.

    :param data_dir: Root directory of the unpacked dataset; may be relative.
    :param manifest_path: Output file; overwritten if it already exists.
    """
    print("Creating manifest %s ..." % manifest_path)
    json_lines = []
    for subfolder, _, filelist in os.walk(data_dir):
        transcripts = [
            filename for filename in filelist if filename.endswith('trans.txt')
        ]
        if not transcripts:
            continue
        # os.walk() already yields `subfolder` prefixed with `data_dir`;
        # joining `data_dir` in front again duplicates the prefix and breaks
        # every path when `data_dir` is relative.
        text_filepath = os.path.join(subfolder, transcripts[0])
        with open(text_filepath) as text_file:
            for line in text_file:
                segments = line.split()
                if not segments:
                    # Blank line: there is no utterance id to look up.
                    continue
                text = ' '.join(segments[1:]).lower()
                audio_filepath = os.path.join(subfolder,
                                              segments[0] + '.flac')
                audio_data, samplerate = soundfile.read(audio_filepath)
                duration = float(len(audio_data)) / samplerate
                json_lines.append(
                    json.dumps({
                        'audio_filepath': audio_filepath,
                        'duration': duration,
                        'text': text
                    }))
    with open(manifest_path, 'w') as out_file:
        for json_line in json_lines:
            out_file.write(json_line + '\n')
def prepare_dataset(url, target_dir, manifest_path):
    """Run the whole pipeline for one archive: download, unpack, manifest.

    :param url: Address of the dataset archive.
    :param target_dir: Directory used for both the archive and its contents.
    :param manifest_path: File the resulting manifest is written to.
    """
    archive_path = download(url, target_dir)
    data_root = unpack(archive_path, target_dir)
    create_manifest(data_root, manifest_path)
def main():
    """Prepare the test, dev and train splits, in that order.

    Every split is stored under ``args.target_dir``; its manifest is written
    to ``args.manifest`` followed by a split-specific suffix.
    """
    splits = (
        (URL_TEST, ".test"),
        (URL_DEV, ".dev"),
        (URL_TRAIN, ".train"),
    )
    for archive_url, suffix in splits:
        prepare_dataset(
            url=archive_url,
            target_dir=args.target_dir,
            manifest_path=args.manifest + suffix)


if __name__ == '__main__':
    main()