diff --git a/data/aishell/aishell.py b/data/aishell/aishell.py
new file mode 100644
index 00000000..17786b5d
--- /dev/null
+++ b/data/aishell/aishell.py
@@ -0,0 +1,119 @@
+"""Prepare Aishell mandarin dataset
+
+Download, unpack and create manifest files.
+Manifest file is a json-format file with each line containing the
+meta data (i.e. audio filepath, transcript and audio duration)
+of each audio file in the data set.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import codecs
+import soundfile
+import json
+import argparse
+from data_utils.utility import download, unpack
+
+DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
+
+URL_ROOT = 'http://www.openslr.org/resources/33'
+DATA_URL = URL_ROOT + '/data_aishell.tgz'
+MD5_DATA = '2f494334227864a8a8fec932999db9d8'
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    "--target_dir",
+    default=DATA_HOME + "/Aishell",
+    type=str,
+    help="Directory to save the dataset. (default: %(default)s)")
+parser.add_argument(
+    "--manifest_prefix",
+    default="manifest",
+    type=str,
+    help="Filepath prefix for output manifests. (default: %(default)s)")
+args = parser.parse_args()
+
+
+def create_manifest(data_dir, manifest_path_prefix):
+    """Create one manifest file per data split (train, dev, test).
+
+    Each manifest is written to `manifest_path_prefix` + '.' + split name and
+    holds one json line (audio filepath, duration in seconds, transcript) per
+    audio file of that split. Audio files without a transcript are skipped.
+    """
+    print("Creating manifest %s ..." % manifest_path_prefix)
+    transcript_path = os.path.join(data_dir, 'transcript',
+                                   'aishell_transcript_v0.8.txt')
+    transcript_dict = {}
+    with codecs.open(transcript_path, 'r', 'utf-8') as fin:
+        for line in fin:
+            line = line.strip()
+            if not line:
+                continue
+            audio_id, text = line.split(' ', 1)
+            # remove whitespace
+            text = ''.join(text.split())
+            transcript_dict[audio_id] = text
+
+    data_types = ['train', 'dev', 'test']
+    for dtype in data_types:
+        # start from an empty list for every split, otherwise the dev and
+        # test manifests would also contain the entries of earlier splits
+        json_lines = []
+        audio_dir = os.path.join(data_dir, 'wav', dtype)
+        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
+            for fname in filelist:
+                audio_path = os.path.join(subfolder, fname)
+                audio_id = os.path.splitext(fname)[0]
+                # if no transcription for audio then skipped
+                if audio_id not in transcript_dict:
+                    continue
+                audio_data, samplerate = soundfile.read(audio_path)
+                duration = float(len(audio_data) / samplerate)
+                text = transcript_dict[audio_id]
+                json_lines.append(
+                    json.dumps(
+                        {
+                            'audio_filepath': audio_path,
+                            'duration': duration,
+                            'text': text
+                        },
+                        ensure_ascii=False))
+        manifest_path = manifest_path_prefix + '.' + dtype
+        with codecs.open(manifest_path, 'w', 'utf-8') as fout:
+            for line in json_lines:
+                fout.write(line + '\n')
+
+
+def prepare_dataset(url, md5sum, target_dir, manifest_path):
+    """Download, unpack and create manifest file."""
+    data_dir = os.path.join(target_dir, 'data_aishell')
+    if not os.path.exists(data_dir):
+        filepath = download(url, md5sum, target_dir)
+        unpack(filepath, target_dir)
+        # unpack all audio tar files
+        audio_dir = os.path.join(data_dir, 'wav')
+        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
+            for ftar in filelist:
+                unpack(os.path.join(subfolder, ftar), subfolder, True)
+    else:
+        print("Skip downloading and unpacking. Data already exists in %s." %
+              target_dir)
+    create_manifest(data_dir, manifest_path)
+
+
+def main():
+    if args.target_dir.startswith('~'):
+        args.target_dir = os.path.expanduser(args.target_dir)
+
+    prepare_dataset(
+        url=DATA_URL,
+        md5sum=MD5_DATA,
+        target_dir=args.target_dir,
+        manifest_path=args.manifest_prefix)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/data/librispeech/librispeech.py b/data/librispeech/librispeech.py
index 8dce359a..9a8e1c28 100644
--- a/data/librispeech/librispeech.py
+++ b/data/librispeech/librispeech.py
@@ -12,12 +12,11 @@ from __future__ import print_function
 import distutils.util
 import os
 import sys
-import tarfile
 import argparse
 import soundfile
 import json
 import codecs
-from paddle.v2.dataset.common import md5file
+from data_utils.utility import download, unpack
 
 URL_ROOT = "http://www.openslr.org/resources/12"
 URL_TEST_CLEAN = URL_ROOT + "/test-clean.tar.gz"
@@ -57,31 +56,6 @@ parser.add_argument(
 args = parser.parse_args()
 
 
-def download(url, md5sum, target_dir):
-    """Download file from url to target_dir, and check md5sum.
-    """
-    if not os.path.exists(target_dir): os.makedirs(target_dir)
-    filepath = os.path.join(target_dir, url.split("/")[-1])
-    if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
-        print("Downloading %s ..." % url)
-        os.system("wget -c " + url + " -P " + target_dir)
-        print("\nMD5 Chesksum %s ..." % filepath)
-        if not md5file(filepath) == md5sum:
-            raise RuntimeError("MD5 checksum failed.")
-    else:
-        print("File exists, skip downloading. (%s)" % filepath)
-    return filepath
-
-
-def unpack(filepath, target_dir):
-    """Unpack the file to the target_dir.
-    """
-    print("Unpacking %s ..." % filepath)
-    tar = tarfile.open(filepath)
-    tar.extractall(target_dir)
-    tar.close()
-
-
 def create_manifest(data_dir, manifest_path):
     """Create a manifest json file summarizing the data set, with each line
     containing the meta data (i.e. audio filepath, transcription text, audio
@@ -129,7 +103,8 @@ def prepare_dataset(url, md5sum, target_dir, manifest_path):
 
 
 def main():
-    args.target_dir = os.path.expanduser(args.target_dir)
+    if args.target_dir.startswith('~'):
+        args.target_dir = os.path.expanduser(args.target_dir)
 
     prepare_dataset(
         url=URL_TEST_CLEAN,
diff --git a/data_utils/utility.py b/data_utils/utility.py
index f970ff55..da7b66ef 100644
--- a/data_utils/utility.py
+++ b/data_utils/utility.py
@@ -5,6 +5,9 @@ from __future__ import print_function
 
 import json
 import codecs
+import os
+import tarfile
+from paddle.v2.dataset.common import md5file
 
 
 def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
@@ -33,3 +36,31 @@ def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
                 json_data["duration"] >= min_duration):
             manifest.append(json_data)
     return manifest
+
+
+def download(url, md5sum, target_dir):
+    """Download file from url to target_dir, and check md5sum."""
+    if not os.path.exists(target_dir): os.makedirs(target_dir)
+    filepath = os.path.join(target_dir, url.split("/")[-1])
+    if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
+        print("Downloading %s ..." % url)
+        os.system("wget -c " + url + " -P " + target_dir)
+        print("\nMD5 Checksum %s ..." % filepath)
+        if not md5file(filepath) == md5sum:
+            raise RuntimeError("MD5 checksum failed.")
+    else:
+        print("File exists, skip downloading. (%s)" % filepath)
+    return filepath
+
+
+def unpack(filepath, target_dir, rm_tar=False):
+    """Unpack the file to the target_dir.
+
+    If rm_tar is True, the archive is removed after it has been unpacked.
+    """
+    print("Unpacking %s ..." % filepath)
+    tar = tarfile.open(filepath)
+    tar.extractall(target_dir)
+    tar.close()
+    if rm_tar:
+        os.remove(filepath)
diff --git a/examples/aishell/run_data.sh b/examples/aishell/run_data.sh
new file mode 100644
index 00000000..db27c530
--- /dev/null
+++ b/examples/aishell/run_data.sh
@@ -0,0 +1,42 @@
+#! /usr/bin/env bash
+
+pushd ../.. > /dev/null
+
+# download data, generate manifests
+PYTHONPATH=.:$PYTHONPATH python data/aishell/aishell.py \
+--manifest_prefix='data/aishell/manifest' \
+--target_dir='~/.cache/paddle/dataset/speech/Aishell'
+
+if [ $? -ne 0 ]; then
+    echo "Prepare Aishell failed. Terminated."
+    exit 1
+fi
+
+
+# build vocabulary
+python tools/build_vocab.py \
+--count_threshold=0 \
+--vocab_path='data/aishell/vocab.txt' \
+--manifest_paths='data/aishell/manifest.train'
+
+if [ $? -ne 0 ]; then
+    echo "Build vocabulary failed. Terminated."
+    exit 1
+fi
+
+
+# compute mean and stddev for normalizer
+python tools/compute_mean_std.py \
+--manifest_path='data/aishell/manifest.train' \
+--num_samples=2000 \
+--specgram_type='linear' \
+--output_path='data/aishell/mean_std.npz'
+
+if [ $? -ne 0 ]; then
+    echo "Compute mean and stddev failed. Terminated."
+    exit 1
+fi
+
+
+echo "Aishell data preparation done."
+exit 0
diff --git a/examples/librispeech/run_data.sh b/examples/librispeech/run_data.sh
index bdd5abb5..957416f4 100644
--- a/examples/librispeech/run_data.sh
+++ b/examples/librispeech/run_data.sh
@@ -3,7 +3,7 @@
 pushd ../.. > /dev/null
 
 # download data, generate manifests
-python data/librispeech/librispeech.py \
+PYTHONPATH=.:$PYTHONPATH python data/librispeech/librispeech.py \
 --manifest_prefix='data/librispeech/manifest' \
 --target_dir='~/.cache/paddle/dataset/speech/Libri' \
 --full_download='True'