Add script for VoxForge data preparation.

8 years ago · abbfa43b22
parent b7d9479021
commit abbfa43b22
4 changed files with 260 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -506,6 +506,8 @@ VoxForge European       |   31.21           |   20.47
 VoxForge Indian         |   56.79           |   28.15
 Baidu Internal Testset  |   47.73           |   8.92
 For reproducing results on VoxForge data, we provide a script to generate VoxForge dialect manifest files. Please go to ```data/voxforge``` and execute ```sh run_data.sh``` to get VoxForge dialect manifest files. Notice that VoxForge data may keep updated and the generated manifest files may have difference from those we evaluated.
 #### Benchmark Results for Mandarin Model (Character Error Rate)
 Test Set                | Aishell Model     | BaiduCN1.2k Model
--- a/data/voxforge/run_data.sh
+++ b/data/voxforge/run_data.sh
@ -0,0 +1,18 @@
 #! /usr/bin/env bash
 cd ../.. > /dev/null
 # download data, generate manifests
 PYTHONPATH=.:$PYTHONPATH python data/voxforge/voxforge.py \
 --manifest_prefix='data/voxforge/manifest' \
 --target_dir='~/.cache/paddle/dataset/speech/VoxForge' \
 --is_merge_dialect=True \
 --dialects 'american' 'british' 'australian' 'european' 'irish' 'canadian' 'indian'
 if [ $? -ne 0 ]; then
    echo "Prepare VoxForge failed. Terminated."
    exit 1
 fi
 echo "VoxForge Data preparation done."
 exit 0
--- a/data/voxforge/voxforge.py
+++ b/data/voxforge/voxforge.py
@ -0,0 +1,221 @@
 """Prepare VoxForge dataset
 Download, unpack and create manifest files.
 Manifest file is a json-format file with each line containing the
 meta data (i.e. audio filepath, transcript and audio duration)
 of each audio file in the data set.
 """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 import os
 import codecs
 import soundfile
 import json
 import argparse
 import shutil
 import subprocess
 from data_utils.utility import download_multi, unpack, getfile_insensitive
 DATA_HOME = '~/.cache/paddle/dataset/speech'
 DATA_URL = 'http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/' \
           'Audio/Main/16kHz_16bit'
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
    "--target_dir",
    default=DATA_HOME + "/VoxForge",
    type=str,
    help="Directory to save the dataset. (default: %(default)s)")
 parser.add_argument(
    "--dialects",
    default=[
        'american', 'british', 'australian', 'european', 'irish', 'canadian',
        'indian'
    ],
    nargs='+',
    type=str,
    help="Dialect types. (default: %(default)s)")
 parser.add_argument(
    "--is_merge_dialect",
    default=True,
    type=bool,
    help="If set True, manifests of american dialect and canadian dialect will "
    "be merged to american-canadian dialect; manifests of british "
    "dialect, irish dialect and australian dialect will be merged to "
    "commonwealth dialect. (default: %(default)s)")
 parser.add_argument(
    "--manifest_prefix",
    default="manifest",
    type=str,
    help="Filepath prefix for output manifests. (default: %(default)s)")
 args = parser.parse_args()
 def download_and_unpack(target_dir, url):
    wget_args = '-q -l 1 -N -nd -c -e robots=off -A tgz -r -np'
    tgz_dir = os.path.join(target_dir, 'tgz')
    exit_code = download_multi(url, tgz_dir, wget_args)
    if exit_code != 0:
        print('Download tgz audio files failed with exit code %d.' % exit_code)
    else:
        print('Download done, start unpacking ...')
        audio_dir = os.path.join(target_dir, 'audio')
        for root, dirs, files in os.walk(tgz_dir):
            for file in files:
                print(file)
                if file.endswith('.tgz'):
                    unpack(os.path.join(root, file), audio_dir)
 def select_dialects(target_dir, dialect_list):
    """Classify audio files by dialect."""
    dialect_root_dir = os.path.join(target_dir, 'dialect')
    if os.path.exists(dialect_root_dir):
        shutil.rmtree(dialect_root_dir)
    os.mkdir(dialect_root_dir)
    audio_dir = os.path.abspath(os.path.join(target_dir, 'audio'))
    for dialect in dialect_list:
        # filter files by dialect
        command = 'find %s -iwholename "*etc/readme*" -exec egrep -iHl \
                   "pronunciation dialect.*%s" {} \;' % (audio_dir, dialect)
        p = subprocess.Popen(
            command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
        output, err = p.communicate()
        dialect_dir = os.path.join(dialect_root_dir, dialect)
        if os.path.exists(dialect_dir):
            shutil.rmtree(dialect_dir)
        os.mkdir(dialect_dir)
        for path in output.splitlines():
            src_dir = os.path.dirname(os.path.dirname(path))
            link = os.path.basename(os.path.normpath(src_dir))
            os.symlink(src_dir, os.path.join(dialect_dir, link))
 def generate_manifest(data_dir, manifest_path):
    json_lines = []
    for path in os.listdir(data_dir):
        audio_link = os.path.join(data_dir, path)
        assert os.path.islink(
            audio_link), '%s should be symbolic link.' % audio_link
        actual_audio_dir = os.path.abspath(os.readlink(audio_link))
        audio_type = ''
        if os.path.isdir(os.path.join(actual_audio_dir, 'wav')):
            audio_type = 'wav'
        elif os.path.isdir(os.path.join(actual_audio_dir, 'flac')):
            audio_type = 'flac'
        else:
            print('Unknown audio type, skipped processing %s.' %
                  actual_audio_dir)
            continue
        etc_dir = os.path.join(actual_audio_dir, 'etc')
        prompts_file = os.path.join(etc_dir, 'PROMPTS')
        if not os.path.isfile(prompts_file):
            print('PROMPTS file missing, skip processing %s.' %
                  actual_audio_dir)
            continue
        readme_file = getfile_insensitive(os.path.join(etc_dir, 'README'))
        if readme_file is None:
            print('README file missing, skip processing %s.' % actual_audio_dir)
            continue
        for line in file(prompts_file):
            u, trans = line.strip().split(None, 1)
            u_parts = u.split('/')
            # try to format the date time
            try:
                speaker, date, sfx = u_parts[-3].split('-')
                obj = datetime.datetime.strptime(date, '%y.%m.%d')
                formatted = obj.strftime('%Y%m%d')
                u_parts[-3] = '-'.join([speaker, formatted, sfx])
            except Exception as e:
                pass
            if len(u_parts) < 2:
                u_parts = [audio_type] + u_parts
            u_parts[-2] = audio_type
            u_parts[-1] += '.' + audio_type
            u = os.path.join(actual_audio_dir, '/'.join(u_parts[-2:]))
            if not os.path.isfile(u):
                print('Audio file missing, skip processing %s.' % u)
                continue
            if os.stat(u).st_size == 0:
                print('Empty audio file, skip processing %s.' % u)
                continue
            trans = trans.strip().replace('-', ' ')
            if not trans.isupper() or \
                not trans.strip().replace(' ', '').replace("'", "").isalpha():
                print("Transcript not normalized properly, skip processing %s."
                      % u)
                continue
            audio_data, samplerate = soundfile.read(u)
            duration = float(len(audio_data)) / samplerate
            json_lines.append(
                json.dumps({
                    'audio_filepath': u,
                    'duration': duration,
                    'text': trans.lower()
                }))
    with codecs.open(manifest_path, 'w', 'utf-8') as fout:
        for line in json_lines:
            fout.write(line + '\n')
 def merge_manifests(manifest_files, save_path):
    lines = []
    for manifest_file in manifest_files:
        line = codecs.open(manifest_file, 'r', 'utf-8').readlines()
        lines += line
    with codecs.open(save_path, 'w', 'utf-8') as fout:
        for line in lines:
            fout.write(line)
 def prepare_dataset(url, dialects, target_dir, manifest_prefix, is_merge):
    download_and_unpack(target_dir, url)
    select_dialects(target_dir, dialects)
    american_canadian_manifests = []
    commonwealth_manifests = []
    for dialect in dialects:
        dialect_dir = os.path.join(target_dir, 'dialect', dialect)
        manifest_fpath = manifest_prefix + '.' + dialect
        if dialect == 'american' or dialect == 'canadian':
            american_canadian_manifests.append(manifest_fpath)
        if dialect == 'australian' \
                or dialect == 'british' \
                or dialect == 'irish':
            commonwealth_manifests.append(manifest_fpath)
        generate_manifest(dialect_dir, manifest_fpath)
    if is_merge:
        if len(american_canadian_manifests) > 0:
            manifest_fpath = manifest_prefix + '.american-canadian'
            merge_manifests(american_canadian_manifests, manifest_fpath)
        if len(commonwealth_manifests) > 0:
            manifest_fpath = manifest_prefix + '.commonwealth'
            merge_manifests(commonwealth_manifests, manifest_fpath)
 def main():
    if args.target_dir.startswith('~'):
        args.target_dir = os.path.expanduser(args.target_dir)
    prepare_dataset(DATA_URL, args.dialects, args.target_dir,
                    args.manifest_prefix, args.is_merge_dialect)
 if __name__ == '__main__':
    main()
--- a/data_utils/utility.py
+++ b/data_utils/utility.py
@ -42,6 +42,25 @@ def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
    return manifest
 def getfile_insensitive(path):
    """Get the actual file path when given insensitive filename."""
    directory, filename = os.path.split(path)
    directory, filename = (directory or '.'), filename.lower()
    for f in os.listdir(directory):
        newpath = os.path.join(directory, f)
        if os.path.isfile(newpath) and f.lower() == filename:
            return newpath
 def download_multi(url, target_dir, extra_args):
    """Download multiple files from url to target_dir."""
    if not os.path.exists(target_dir): os.makedirs(target_dir)
    print("Downloading %s ..." % url)
    ret_code = os.system("wget -c " + url + ' ' + extra_args + " -P " +
                         target_dir)
    return ret_code
 def download(url, md5sum, target_dir):
    """Download file from url to target_dir, and check md5sum."""
    if not os.path.exists(target_dir): os.makedirs(target_dir)