From abbfa43b22d19b990df9a239fee5a4fbdd06b996 Mon Sep 17 00:00:00 2001
From: yangyaming
Date: Thu, 16 Nov 2017 23:04:35 +0800
Subject: [PATCH] Add script for VoxForge data preparation.

---
 README.md                 |   2 +
 data/voxforge/run_data.sh |  18 +++
 data/voxforge/voxforge.py | 274 ++++++++++++++++++++++++++++++++++++++
 data_utils/utility.py     |  36 +++++
 4 files changed, 330 insertions(+)
 create mode 100644 data/voxforge/run_data.sh
 create mode 100644 data/voxforge/voxforge.py

diff --git a/README.md b/README.md
index ca146926..6f282a28 100644
--- a/README.md
+++ b/README.md
@@ -506,6 +506,8 @@ VoxForge European | 31.21 | 20.47
 VoxForge Indian | 56.79 | 28.15
 Baidu Internal Testset  |   47.73 |   8.92
 
+To reproduce the results on VoxForge data, we provide a script to generate the VoxForge dialect manifest files. Please go to ```data/voxforge``` and execute ```sh run_data.sh``` to get the VoxForge dialect manifest files. Note that the VoxForge data may be updated over time, so the generated manifest files may differ from those we evaluated on.
+
 #### Benchmark Results for Mandarin Model (Character Error Rate)
 
 Test Set | Aishell Model | BaiduCN1.2k Model
diff --git a/data/voxforge/run_data.sh b/data/voxforge/run_data.sh
new file mode 100644
index 00000000..e0a9f1b3
--- /dev/null
+++ b/data/voxforge/run_data.sh
@@ -0,0 +1,18 @@
+#! /usr/bin/env bash
+
+cd ../.. > /dev/null
+
+# download data, generate manifests
+PYTHONPATH=.:$PYTHONPATH python data/voxforge/voxforge.py \
+--manifest_prefix='data/voxforge/manifest' \
+--target_dir='~/.cache/paddle/dataset/speech/VoxForge' \
+--is_merge_dialect=True \
+--dialects 'american' 'british' 'australian' 'european' 'irish' 'canadian' 'indian'
+
+if [ $? -ne 0 ]; then
+    echo "Prepare VoxForge failed. Terminated."
+    exit 1
+fi
+
+echo "VoxForge Data preparation done."
+exit 0
diff --git a/data/voxforge/voxforge.py b/data/voxforge/voxforge.py
new file mode 100644
index 00000000..63f052bd
--- /dev/null
+++ b/data/voxforge/voxforge.py
@@ -0,0 +1,274 @@
+"""Prepare VoxForge dataset
+
+Download, unpack and create manifest files.
+Manifest file is a json-format file with each line containing the
+meta data (i.e. audio filepath, transcript and audio duration)
+of each audio file in the data set.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import codecs
+import soundfile
+import json
+import argparse
+import shutil
+import subprocess
+from data_utils.utility import download_multi, unpack, getfile_insensitive
+
+DATA_HOME = '~/.cache/paddle/dataset/speech'
+
+DATA_URL = 'http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/' \
+    'Audio/Main/16kHz_16bit'
+
+
+def str2bool(value):
+    """Parse a boolean command-line value.
+
+    argparse's ``type=bool`` treats every non-empty string, including
+    'False', as True, so the text is interpreted explicitly here.
+    """
+    lowered = value.lower()
+    if lowered in ('true', 't', 'yes', 'y', '1'):
+        return True
+    if lowered in ('false', 'f', 'no', 'n', '0'):
+        return False
+    raise argparse.ArgumentTypeError('Boolean value expected, got %r.' % value)
+
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    "--target_dir",
+    default=DATA_HOME + "/VoxForge",
+    type=str,
+    help="Directory to save the dataset. (default: %(default)s)")
+parser.add_argument(
+    "--dialects",
+    default=[
+        'american', 'british', 'australian', 'european', 'irish', 'canadian',
+        'indian'
+    ],
+    nargs='+',
+    type=str,
+    help="Dialect types. (default: %(default)s)")
+parser.add_argument(
+    "--is_merge_dialect",
+    default=True,
+    type=str2bool,
+    help="If set True, manifests of american dialect and canadian dialect will "
+    "be merged to american-canadian dialect; manifests of british "
+    "dialect, irish dialect and australian dialect will be merged to "
+    "commonwealth dialect. (default: %(default)s)")
+parser.add_argument(
+    "--manifest_prefix",
+    default="manifest",
+    type=str,
+    help="Filepath prefix for output manifests. (default: %(default)s)")
+args = parser.parse_args()
+
+
+def download_and_unpack(target_dir, url):
+    """Download the tgz archives found under url and unpack them.
+
+    Archives are saved to <target_dir>/tgz and unpacked to <target_dir>/audio.
+    Nothing is unpacked when the download reports a non-zero exit code.
+    """
+    wget_args = '-q -l 1 -N -nd -c -e robots=off -A tgz -r -np'
+    tgz_dir = os.path.join(target_dir, 'tgz')
+    exit_code = download_multi(url, tgz_dir, wget_args)
+    if exit_code != 0:
+        print('Download tgz audio files failed with exit code %d.' % exit_code)
+    else:
+        print('Download done, start unpacking ...')
+        audio_dir = os.path.join(target_dir, 'audio')
+        for root, dirs, files in os.walk(tgz_dir):
+            for filename in files:
+                print(filename)
+                if filename.endswith('.tgz'):
+                    unpack(os.path.join(root, filename), audio_dir)
+
+
+def select_dialects(target_dir, dialect_list):
+    """Classify audio files by dialect.
+
+    For each dialect, the files matching "*etc/readme*" under
+    <target_dir>/audio are searched for a "pronunciation dialect" entry naming
+    that dialect. The directory two levels above every matching file is
+    symlinked into <target_dir>/dialect/<dialect>.
+    """
+    dialect_root_dir = os.path.join(target_dir, 'dialect')
+    if os.path.exists(dialect_root_dir):
+        shutil.rmtree(dialect_root_dir)
+    os.mkdir(dialect_root_dir)
+    audio_dir = os.path.abspath(os.path.join(target_dir, 'audio'))
+    for dialect in dialect_list:
+        # filter files by dialect; an argument list (no shell) keeps paths
+        # containing spaces or shell metacharacters intact
+        command = [
+            'find', audio_dir, '-iwholename', '*etc/readme*', '-exec', 'egrep',
+            '-iHl', 'pronunciation dialect.*%s' % dialect, '{}', ';'
+        ]
+        p = subprocess.Popen(
+            command,
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            universal_newlines=True)
+        output, err = p.communicate()
+        dialect_dir = os.path.join(dialect_root_dir, dialect)
+        if os.path.exists(dialect_dir):
+            shutil.rmtree(dialect_dir)
+        os.mkdir(dialect_dir)
+        for path in output.splitlines():
+            src_dir = os.path.dirname(os.path.dirname(path))
+            link = os.path.join(dialect_dir,
+                                os.path.basename(os.path.normpath(src_dir)))
+            # several readme variants in one etc directory give the same
+            # link name; create the link only once
+            if not os.path.lexists(link):
+                os.symlink(src_dir, link)
+
+
+def generate_manifest(data_dir, manifest_path):
+    """Write a manifest for the recordings symlinked in data_dir.
+
+    Every entry of data_dir must be a symbolic link to a recording directory
+    with an 'etc' directory (PROMPTS and README files) and a 'wav' or 'flac'
+    directory. One json line with the audio filepath, duration and lowercased
+    transcript is written to manifest_path per usable utterance; recordings
+    and utterances that fail a check are reported and skipped.
+    """
+    json_lines = []
+
+    for path in os.listdir(data_dir):
+        audio_link = os.path.join(data_dir, path)
+        assert os.path.islink(
+            audio_link), '%s should be symbolic link.' % audio_link
+        actual_audio_dir = os.path.abspath(os.readlink(audio_link))
+
+        audio_type = ''
+        if os.path.isdir(os.path.join(actual_audio_dir, 'wav')):
+            audio_type = 'wav'
+        elif os.path.isdir(os.path.join(actual_audio_dir, 'flac')):
+            audio_type = 'flac'
+        else:
+            print('Unknown audio type, skipped processing %s.' %
+                  actual_audio_dir)
+            continue
+
+        etc_dir = os.path.join(actual_audio_dir, 'etc')
+        prompts_file = os.path.join(etc_dir, 'PROMPTS')
+        if not os.path.isfile(prompts_file):
+            print('PROMPTS file missing, skip processing %s.' %
+                  actual_audio_dir)
+            continue
+
+        readme_file = getfile_insensitive(os.path.join(etc_dir, 'README'))
+        if readme_file is None:
+            print('README file missing, skip processing %s.' % actual_audio_dir)
+            continue
+
+        # open() replaces the Python 2-only file() builtin and the with-block
+        # closes the handle
+        with open(prompts_file) as prompts:
+            prompt_lines = prompts.readlines()
+
+        for line in prompt_lines:
+            fields = line.strip().split(None, 1)
+            if len(fields) != 2:
+                # blank line, or utterance id without a transcript
+                continue
+            u, trans = fields
+            u_parts = u.split('/')
+
+            # only the last two path components are used to locate the audio
+            if len(u_parts) < 2:
+                u_parts = [audio_type] + u_parts
+            u_parts[-2] = audio_type
+            u_parts[-1] += '.' + audio_type
+            u = os.path.join(actual_audio_dir, '/'.join(u_parts[-2:]))
+
+            if not os.path.isfile(u):
+                print('Audio file missing, skip processing %s.' % u)
+                continue
+
+            if os.stat(u).st_size == 0:
+                print('Empty audio file, skip processing %s.' % u)
+                continue
+
+            trans = trans.strip().replace('-', ' ')
+            if not trans.isupper() or \
+                not trans.strip().replace(' ', '').replace("'", "").isalpha():
+                print("Transcript not normalized properly, skip processing %s."
+                      % u)
+                continue
+
+            audio_data, samplerate = soundfile.read(u)
+            duration = float(len(audio_data)) / samplerate
+            json_lines.append(
+                json.dumps({
+                    'audio_filepath': u,
+                    'duration': duration,
+                    'text': trans.lower()
+                }))
+
+    with codecs.open(manifest_path, 'w', 'utf-8') as fout:
+        for line in json_lines:
+            fout.write(line + '\n')
+
+
+def merge_manifests(manifest_files, save_path):
+    """Concatenate the given manifest files into save_path."""
+    lines = []
+    for manifest_file in manifest_files:
+        with codecs.open(manifest_file, 'r', 'utf-8') as fin:
+            lines += fin.readlines()
+
+    with codecs.open(save_path, 'w', 'utf-8') as fout:
+        for line in lines:
+            fout.write(line)
+
+
+def prepare_dataset(url, dialects, target_dir, manifest_prefix, is_merge):
+    """Download the data and write one manifest per dialect.
+
+    When is_merge is true, the american and canadian manifests are also
+    merged into '<manifest_prefix>.american-canadian', and the australian,
+    british and irish manifests into '<manifest_prefix>.commonwealth'.
+    """
+    download_and_unpack(target_dir, url)
+    select_dialects(target_dir, dialects)
+    american_canadian_manifests = []
+    commonwealth_manifests = []
+    for dialect in dialects:
+        dialect_dir = os.path.join(target_dir, 'dialect', dialect)
+        manifest_fpath = manifest_prefix + '.' + dialect
+        if dialect == 'american' or dialect == 'canadian':
+            american_canadian_manifests.append(manifest_fpath)
+        if dialect == 'australian' \
+            or dialect == 'british' \
+            or dialect == 'irish':
+            commonwealth_manifests.append(manifest_fpath)
+        generate_manifest(dialect_dir, manifest_fpath)
+
+    if is_merge:
+        if len(american_canadian_manifests) > 0:
+            manifest_fpath = manifest_prefix + '.american-canadian'
+            merge_manifests(american_canadian_manifests, manifest_fpath)
+        if len(commonwealth_manifests) > 0:
+            manifest_fpath = manifest_prefix + '.commonwealth'
+            merge_manifests(commonwealth_manifests, manifest_fpath)
+
+
+def main():
+    """Expand the target directory and run the data preparation."""
+    if args.target_dir.startswith('~'):
+        args.target_dir = os.path.expanduser(args.target_dir)
+
+    prepare_dataset(DATA_URL, args.dialects, args.target_dir,
+                    args.manifest_prefix, args.is_merge_dialect)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/data_utils/utility.py b/data_utils/utility.py
index bb5cad45..2633e1b4 100644
--- a/data_utils/utility.py
+++ b/data_utils/utility.py
@@ -42,6 +42,42 @@ def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
     return manifest
 
 
+def getfile_insensitive(path):
+    """Get the actual file path when given insensitive filename.
+
+    Returns None when the directory holds no file with a matching name.
+    """
+    directory, filename = os.path.split(path)
+    directory, filename = (directory or '.'), filename.lower()
+    for f in os.listdir(directory):
+        newpath = os.path.join(directory, f)
+        if os.path.isfile(newpath) and f.lower() == filename:
+            return newpath
+    return None
+
+
+def download_multi(url, target_dir, extra_args):
+    """Download multiple files from url to target_dir.
+
+    extra_args is a string of additional wget options. Returns the exit code
+    of wget (0 on success), or 127 when wget could not be started.
+    """
+    import shlex
+    import subprocess
+    if not os.path.exists(target_dir): os.makedirs(target_dir)
+    print("Downloading %s ..." % url)
+    # An argument list (no shell) keeps urls and paths with spaces intact, and
+    # subprocess.call() returns the real exit code rather than the encoded
+    # wait status reported by os.system().
+    command = ['wget', '-c', url] + shlex.split(extra_args)
+    command += ['-P', target_dir]
+    try:
+        return subprocess.call(command)
+    except OSError as e:
+        print("Failed to run wget: %s" % e)
+        return 127
+
+
 def download(url, md5sum, target_dir):
     """Download file from url to target_dir, and check md5sum."""
     if not os.path.exists(target_dir): os.makedirs(target_dir)