Add data preparation for Aishell.

8 years ago · e9a42044f5
parent 3bed29ddda
commit e9a42044f5
5 changed files with 155 additions and 3 deletions
--- a/data/aishell/aishell.py
+++ b/data/aishell/aishell.py
@ -0,0 +1,109 @@
 """Prepare Aishell mandarin dataset
 Download, unpack and create manifest files.
 Manifest file is a json-format file with each line containing the
 meta data (i.e. audio filepath, transcript and audio duration)
 of each audio file in the data set.
 """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 import os
 import codecs
 import soundfile
 import json
 import argparse
 from data_utils.utility import download, unpack
 # Base directory used to build the default value of --target_dir.
 DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
 # Location of the Aishell archive that main() hands to prepare_dataset().
 URL_ROOT = 'http://www.openslr.org/resources/33'
 DATA_URL = URL_ROOT + '/data_aishell.tgz'
 # MD5 digest passed to download() together with DATA_URL
 # (presumably used to verify the archive -- confirm in data_utils.utility).
 MD5_DATA = '2f494334227864a8a8fec932999db9d8'
 # Command-line interface; the module docstring doubles as the description.
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
    "--target_dir",
    default=DATA_HOME + "/Aishell",
    type=str,
    help="Directory to save the dataset. (default: %(default)s)")
 # The split name ('train', 'dev', 'test') is appended to this prefix to form
 # each manifest filename.
 parser.add_argument(
    "--manifest_prefix",
    default="manifest",
    type=str,
    help="Filepath prefix for output manifests. (default: %(default)s)")
 # NOTE: arguments are parsed when the module is loaded, not inside main();
 # main() reads the resulting module-level `args`.
 args = parser.parse_args()
 def create_manifest(data_dir, manifest_path_prefix):
    """Write one manifest file per data split (train, dev and test).

    The transcript file ``<data_dir>/transcript/aishell_transcript_v0.8.txt``
    is read first; each non-empty line is ``<audio_id> <text>`` and all
    whitespace inside the text is removed. Then every file found under
    ``<data_dir>/wav/<split>`` whose name (without extension) has a
    transcript is described by one JSON line holding ``audio_filepath``,
    ``duration`` (number of samples divided by the sample rate) and ``text``.
    Audio files without a transcript are skipped.

    :param data_dir: Root directory of the unpacked dataset.
    :type data_dir: str
    :param manifest_path_prefix: Output path prefix; the manifest of each
                                 split is written to ``<prefix>.<split>``.
    :type manifest_path_prefix: str
    """
    print("Creating manifest %s ..." % manifest_path_prefix)
    transcript_path = os.path.join(data_dir, 'transcript',
                                   'aishell_transcript_v0.8.txt')
    transcript_dict = {}
    # Read through a context manager so the transcript file gets closed.
    with codecs.open(transcript_path, 'r', 'utf-8') as transcript_file:
        for line in transcript_file:
            line = line.strip()
            if line == '':
                continue
            audio_id, text = line.split(' ', 1)
            # remove whitespace
            text = ''.join(text.split())
            transcript_dict[audio_id] = text
    data_types = ['train', 'dev', 'test']
    for data_type in data_types:
        # Start from an empty list for every split. Sharing one list across
        # splits would leak the train entries into the dev manifest and the
        # train + dev entries into the test manifest.
        json_lines = []
        audio_dir = os.path.join(data_dir, 'wav', data_type)
        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
            # Sort file names so the manifest order is reproducible.
            for fname in sorted(filelist):
                audio_path = os.path.join(subfolder, fname)
                audio_id = os.path.splitext(fname)[0]
                # if no transcription for audio then skipped
                if audio_id not in transcript_dict:
                    continue
                audio_data, samplerate = soundfile.read(audio_path)
                duration = float(len(audio_data) / samplerate)
                text = transcript_dict[audio_id]
                json_lines.append(
                    json.dumps(
                        {
                            'audio_filepath': audio_path,
                            'duration': duration,
                            'text': text
                        },
                        ensure_ascii=False))
        manifest_path = manifest_path_prefix + '.' + data_type
        with codecs.open(manifest_path, 'w', 'utf-8') as fout:
            for line in json_lines:
                fout.write(line + '\n')
 def prepare_dataset(url, md5sum, target_dir, manifest_path):
    """Fetch and unpack the dataset if needed, then build the manifests.

    When ``<target_dir>/data_aishell`` already exists, downloading and
    unpacking are skipped. Manifests are (re)created in both cases, with
    ``manifest_path`` forwarded to ``create_manifest`` as the path prefix.
    """
    data_dir = os.path.join(target_dir, 'data_aishell')
    if os.path.exists(data_dir):
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)
    else:
        archive_path = download(url, md5sum, target_dir)
        unpack(archive_path, target_dir)
        # The main archive contains further archives under wav/; unpack each
        # one next to where it lies.
        # NOTE(review): the third argument presumably removes the archive
        # after extraction -- confirm in data_utils.utility.unpack.
        wav_root = os.path.join(data_dir, 'wav')
        for folder, _, archives in sorted(os.walk(wav_root)):
            for archive_name in archives:
                unpack(os.path.join(folder, archive_name), folder, True)
    create_manifest(data_dir, manifest_path)
 def main():
    """Expand the target directory and prepare the Aishell dataset."""
    # os.path.expanduser returns a path that does not begin with '~'
    # unchanged, so no explicit prefix check is needed.
    args.target_dir = os.path.expanduser(args.target_dir)
    dataset_options = {
        'url': DATA_URL,
        'md5sum': MD5_DATA,
        'target_dir': args.target_dir,
        'manifest_path': args.manifest_prefix,
    }
    prepare_dataset(**dataset_options)
 if __name__ == '__main__':
    main()
--- a/data/librispeech/librispeech.py
+++ b/data/librispeech/librispeech.py
@ -16,7 +16,6 @@ import argparse
 import soundfile
 import json
 import codecs
 from paddle.v2.dataset.common import md5file
 from data_utils.utility import download, unpack
 URL_ROOT = "http://www.openslr.org/resources/12"
@ -104,7 +103,8 @@ def prepare_dataset(url, md5sum, target_dir, manifest_path):
 def main():
-    args.target_dir = os.path.expanduser(args.target_dir)
+    if args.target_dir.startswith('~'):
        args.target_dir = os.path.expanduser(args.target_dir)
    prepare_dataset(
        url=URL_TEST_CLEAN,
--- a/data_utils/utility.py
+++ b/data_utils/utility.py
@ -7,6 +7,7 @@ import json
 import codecs
 import os
 import tarfile
 from paddle.v2.dataset.common import md5file
 def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
--- a/examples/aishell/run_data.sh
+++ b/examples/aishell/run_data.sh
@ -0,0 +1,42 @@
 #! /usr/bin/env bash
 # Prepare the Aishell data: manifests, vocabulary and normalizer statistics.
 # Each step aborts the script with exit status 1 when its command fails.
 pushd ../.. > /dev/null
 # step 1: download data, generate manifests
 if ! PYTHONPATH=.:$PYTHONPATH python data/aishell/aishell.py \
    --manifest_prefix='data/aishell/manifest' \
    --target_dir='~/.cache/paddle/dataset/speech/Aishell'; then
    echo "Prepare Aishell failed. Terminated."
    exit 1
 fi
 # step 2: build vocabulary from the training manifest
 if ! python tools/build_vocab.py \
    --count_threshold=0 \
    --vocab_path='data/aishell/vocab.txt' \
    --manifest_paths='data/aishell/manifest.train'; then
    echo "Build vocabulary failed. Terminated."
    exit 1
 fi
 # step 3: compute mean and stddev for normalizer
 if ! python tools/compute_mean_std.py \
    --manifest_path='data/aishell/manifest.train' \
    --num_samples=2000 \
    --specgram_type='linear' \
    --output_path='data/aishell/mean_std.npz'; then
    echo "Compute mean and stddev failed. Terminated."
    exit 1
 fi
 echo "Aishell data preparation done."
 exit 0
--- a/examples/librispeech/run_data.sh
+++ b/examples/librispeech/run_data.sh
@ -3,7 +3,7 @@
 pushd ../.. > /dev/null
 # download data, generate manifests
-python data/librispeech/librispeech.py \
+PYTHONPATH=.:$PYPYTHONPATH python data/librispeech/librispeech.py \
 --manifest_prefix='data/librispeech/manifest' \
 --target_dir='~/.cache/paddle/dataset/speech/Libri' \
 --full_download='True'