From e9a42044f578084901b86bd1b6fc9bb3dec1d61f Mon Sep 17 00:00:00 2001
From: yangyaming
Date: Tue, 19 Sep 2017 13:02:31 +0800
Subject: [PATCH] Add data preparing for Aishell.

---
 data/aishell/aishell.py          | 117 +++++++++++++++++++++++++++++++
 data/librispeech/librispeech.py  |   4 +-
 data_utils/utility.py            |   1 +
 examples/aishell/run_data.sh     |  42 ++++++++++++
 examples/librispeech/run_data.sh |   2 +-
 5 files changed, 163 insertions(+), 3 deletions(-)
 create mode 100644 data/aishell/aishell.py
 create mode 100644 examples/aishell/run_data.sh

diff --git a/data/aishell/aishell.py b/data/aishell/aishell.py
new file mode 100644
index 00000000..17786b5d
--- /dev/null
+++ b/data/aishell/aishell.py
@@ -0,0 +1,117 @@
+"""Prepare Aishell mandarin dataset
+
+Download, unpack and create manifest files.
+Manifest file is a json-format file with each line containing the
+meta data (i.e. audio filepath, transcript and audio duration)
+of each audio file in the data set.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import codecs
+import soundfile
+import json
+import argparse
+from data_utils.utility import download, unpack
+
+DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
+
+URL_ROOT = 'http://www.openslr.org/resources/33'
+DATA_URL = URL_ROOT + '/data_aishell.tgz'
+MD5_DATA = '2f494334227864a8a8fec932999db9d8'
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    "--target_dir",
+    default=DATA_HOME + "/Aishell",
+    type=str,
+    help="Directory to save the dataset. (default: %(default)s)")
+parser.add_argument(
+    "--manifest_prefix",
+    default="manifest",
+    type=str,
+    help="Filepath prefix for output manifests. (default: %(default)s)")
+args = parser.parse_args()
+
+
+def create_manifest(data_dir, manifest_path_prefix):
+    """Create one manifest file per data subset (train, dev and test).
+
+    Each manifest is saved as manifest_path_prefix + '.' + subset name and
+    lists only the transcribed audio files of that subset.
+    """
+    print("Creating manifest %s ..." % manifest_path_prefix)
+    transcript_path = os.path.join(data_dir, 'transcript',
+                                   'aishell_transcript_v0.8.txt')
+    transcript_dict = {}
+    with codecs.open(transcript_path, 'r', 'utf-8') as fin:
+        for line in fin:
+            line = line.strip()
+            if line == '': continue
+            audio_id, text = line.split(' ', 1)
+            # remove whitespace
+            text = ''.join(text.split())
+            transcript_dict[audio_id] = text
+
+    data_types = ['train', 'dev', 'test']
+    for data_type in data_types:
+        # start from an empty list, so that a manifest never repeats the
+        # entries of the subsets processed before it
+        json_lines = []
+        audio_dir = os.path.join(data_dir, 'wav', data_type)
+        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
+            for fname in filelist:
+                audio_path = os.path.join(subfolder, fname)
+                audio_id = fname[:-4]
+                # if no transcription for audio then skipped
+                if audio_id not in transcript_dict:
+                    continue
+                audio_data, samplerate = soundfile.read(audio_path)
+                duration = float(len(audio_data) / samplerate)
+                text = transcript_dict[audio_id]
+                json_lines.append(
+                    json.dumps(
+                        {
+                            'audio_filepath': audio_path,
+                            'duration': duration,
+                            'text': text
+                        },
+                        ensure_ascii=False))
+        manifest_path = manifest_path_prefix + '.' + data_type
+        with codecs.open(manifest_path, 'w', 'utf-8') as fout:
+            for line in json_lines:
+                fout.write(line + '\n')
+
+
+def prepare_dataset(url, md5sum, target_dir, manifest_path):
+    """Download, unpack and create manifest file."""
+    data_dir = os.path.join(target_dir, 'data_aishell')
+    if not os.path.exists(data_dir):
+        filepath = download(url, md5sum, target_dir)
+        unpack(filepath, target_dir)
+        # unpack all audio tar files
+        audio_dir = os.path.join(data_dir, 'wav')
+        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
+            for ftar in filelist:
+                unpack(os.path.join(subfolder, ftar), subfolder, True)
+    else:
+        print("Skip downloading and unpacking. Data already exists in %s." %
+              target_dir)
+    create_manifest(data_dir, manifest_path)
+
+
+def main():
+    if args.target_dir.startswith('~'):
+        args.target_dir = os.path.expanduser(args.target_dir)
+
+    prepare_dataset(
+        url=DATA_URL,
+        md5sum=MD5_DATA,
+        target_dir=args.target_dir,
+        manifest_path=args.manifest_prefix)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/data/librispeech/librispeech.py b/data/librispeech/librispeech.py
index 79cc3de8..9a8e1c28 100644
--- a/data/librispeech/librispeech.py
+++ b/data/librispeech/librispeech.py
@@ -16,7 +16,6 @@ import argparse
 import soundfile
 import json
 import codecs
-from paddle.v2.dataset.common import md5file
 from data_utils.utility import download, unpack
 
 URL_ROOT = "http://www.openslr.org/resources/12"
@@ -104,7 +103,8 @@ def prepare_dataset(url, md5sum, target_dir, manifest_path):
 
 
 def main():
-    args.target_dir = os.path.expanduser(args.target_dir)
+    if args.target_dir.startswith('~'):
+        args.target_dir = os.path.expanduser(args.target_dir)
 
     prepare_dataset(
         url=URL_TEST_CLEAN,
diff --git a/data_utils/utility.py b/data_utils/utility.py
index e1e3b55e..da7b66ef 100644
--- a/data_utils/utility.py
+++ b/data_utils/utility.py
@@ -7,6 +7,7 @@ import json
 import codecs
 import os
 import tarfile
+from paddle.v2.dataset.common import md5file
 
 
 def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
diff --git a/examples/aishell/run_data.sh b/examples/aishell/run_data.sh
new file mode 100644
index 00000000..db27c530
--- /dev/null
+++ b/examples/aishell/run_data.sh
@@ -0,0 +1,42 @@
+#! /usr/bin/env bash
+
+pushd ../.. > /dev/null
+
+# download data, generate manifests
+PYTHONPATH=.:$PYTHONPATH python data/aishell/aishell.py \
+--manifest_prefix='data/aishell/manifest' \
+--target_dir='~/.cache/paddle/dataset/speech/Aishell'
+
+if [ $? -ne 0 ]; then
+    echo "Prepare Aishell failed. Terminated."
+    exit 1
+fi
+
+
+# build vocabulary
+python tools/build_vocab.py \
+--count_threshold=0 \
+--vocab_path='data/aishell/vocab.txt' \
+--manifest_paths='data/aishell/manifest.train'
+
+if [ $? -ne 0 ]; then
+    echo "Build vocabulary failed. Terminated."
+    exit 1
+fi
+
+
+# compute mean and stddev for normalizer
+python tools/compute_mean_std.py \
+--manifest_path='data/aishell/manifest.train' \
+--num_samples=2000 \
+--specgram_type='linear' \
+--output_path='data/aishell/mean_std.npz'
+
+if [ $? -ne 0 ]; then
+    echo "Compute mean and stddev failed. Terminated."
+    exit 1
+fi
+
+
+echo "Aishell data preparation done."
+exit 0
diff --git a/examples/librispeech/run_data.sh b/examples/librispeech/run_data.sh
index bdd5abb5..957416f4 100644
--- a/examples/librispeech/run_data.sh
+++ b/examples/librispeech/run_data.sh
@@ -3,7 +3,7 @@
 pushd ../.. > /dev/null
 
 # download data, generate manifests
-python data/librispeech/librispeech.py \
+PYTHONPATH=.:$PYTHONPATH python data/librispeech/librispeech.py \
 --manifest_prefix='data/librispeech/manifest' \
 --target_dir='~/.cache/paddle/dataset/speech/Libri' \
 --full_download='True'