"""Prepare VoxForge dataset Download, unpack and create manifest files. Manifest file is a json-format file with each line containing the meta data (i.e. audio filepath, transcript and audio duration) of each audio file in the data set. """ from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import codecs import soundfile import json import argparse import shutil import subprocess from data_utils.utility import download_multi, unpack, getfile_insensitive DATA_HOME = './dataset' DATA_URL = 'http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/' \ 'Audio/Main/16kHz_16bit' parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( "--target_dir", default=DATA_HOME + "/VoxForge", type=str, help="Directory to save the dataset. (default: %(default)s)") parser.add_argument( "--dialects", default=[ 'american', 'british', 'australian', 'european', 'irish', 'canadian', 'indian' ], nargs='+', type=str, help="Dialect types. (default: %(default)s)") parser.add_argument( "--is_merge_dialect", default=True, type=bool, help="If set True, manifests of american dialect and canadian dialect will " "be merged to american-canadian dialect; manifests of british " "dialect, irish dialect and australian dialect will be merged to " "commonwealth dialect. (default: %(default)s)") parser.add_argument( "--manifest_prefix", default="manifest", type=str, help="Filepath prefix for output manifests. (default: %(default)s)") args = parser.parse_args() def download_and_unpack(target_dir, url): wget_args = '-q -l 1 -N -nd -c -e robots=off -A tgz -r -np' tgz_dir = os.path.join(target_dir, 'tgz') exit_code = download_multi(url, tgz_dir, wget_args) if exit_code != 0: print('Download tgz audio files failed with exit code %d.' % exit_code) else: print('Download done, start unpacking ...') audio_dir = os.path.join(target_dir, 'audio') for root, dirs, files in os.walk(tgz_dir): for file in files: print(file) if file.endswith('.tgz'): unpack(os.path.join(root, file), audio_dir) def select_dialects(target_dir, dialect_list): """Classify audio files by dialect.""" dialect_root_dir = os.path.join(target_dir, 'dialect') if os.path.exists(dialect_root_dir): shutil.rmtree(dialect_root_dir) os.mkdir(dialect_root_dir) audio_dir = os.path.abspath(os.path.join(target_dir, 'audio')) for dialect in dialect_list: # filter files by dialect command = 'find %s -iwholename "*etc/readme*" -exec egrep -iHl \ "pronunciation dialect.*%s" {} \;' % (audio_dir, dialect) p = subprocess.Popen( command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True) output, err = p.communicate() dialect_dir = os.path.join(dialect_root_dir, dialect) if os.path.exists(dialect_dir): shutil.rmtree(dialect_dir) os.mkdir(dialect_dir) for path in output.splitlines(): src_dir = os.path.dirname(os.path.dirname(path)) link = os.path.basename(os.path.normpath(src_dir)) os.symlink(src_dir, os.path.join(dialect_dir, link)) def generate_manifest(data_dir, manifest_path): json_lines = [] for path in os.listdir(data_dir): audio_link = os.path.join(data_dir, path) assert os.path.islink( audio_link), '%s should be symbolic link.' % audio_link actual_audio_dir = os.path.abspath(os.readlink(audio_link)) audio_type = '' if os.path.isdir(os.path.join(actual_audio_dir, 'wav')): audio_type = 'wav' elif os.path.isdir(os.path.join(actual_audio_dir, 'flac')): audio_type = 'flac' else: print('Unknown audio type, skipped processing %s.' % actual_audio_dir) continue etc_dir = os.path.join(actual_audio_dir, 'etc') prompts_file = os.path.join(etc_dir, 'PROMPTS') if not os.path.isfile(prompts_file): print('PROMPTS file missing, skip processing %s.' % actual_audio_dir) continue readme_file = getfile_insensitive(os.path.join(etc_dir, 'README')) if readme_file is None: print('README file missing, skip processing %s.' % actual_audio_dir) continue for line in file(prompts_file): u, trans = line.strip().split(None, 1) u_parts = u.split('/') # try to format the date time try: speaker, date, sfx = u_parts[-3].split('-') obj = datetime.datetime.strptime(date, '%y.%m.%d') formatted = obj.strftime('%Y%m%d') u_parts[-3] = '-'.join([speaker, formatted, sfx]) except Exception as e: pass if len(u_parts) < 2: u_parts = [audio_type] + u_parts u_parts[-2] = audio_type u_parts[-1] += '.' + audio_type u = os.path.join(actual_audio_dir, '/'.join(u_parts[-2:])) if not os.path.isfile(u): print('Audio file missing, skip processing %s.' % u) continue if os.stat(u).st_size == 0: print('Empty audio file, skip processing %s.' % u) continue trans = trans.strip().replace('-', ' ') if not trans.isupper() or \ not trans.strip().replace(' ', '').replace("'", "").isalpha(): print("Transcript not normalized properly, skip processing %s." % u) continue audio_data, samplerate = soundfile.read(u) duration = float(len(audio_data)) / samplerate json_lines.append( json.dumps({ 'audio_filepath': u, 'duration': duration, 'text': trans.lower() })) with codecs.open(manifest_path, 'w', 'utf-8') as fout: for line in json_lines: fout.write(line + '\n') def merge_manifests(manifest_files, save_path): lines = [] for manifest_file in manifest_files: line = codecs.open(manifest_file, 'r', 'utf-8').readlines() lines += line with codecs.open(save_path, 'w', 'utf-8') as fout: for line in lines: fout.write(line) def prepare_dataset(url, dialects, target_dir, manifest_prefix, is_merge): download_and_unpack(target_dir, url) select_dialects(target_dir, dialects) american_canadian_manifests = [] commonwealth_manifests = [] for dialect in dialects: dialect_dir = os.path.join(target_dir, 'dialect', dialect) manifest_fpath = manifest_prefix + '.' + dialect if dialect == 'american' or dialect == 'canadian': american_canadian_manifests.append(manifest_fpath) if dialect == 'australian' \ or dialect == 'british' \ or dialect == 'irish': commonwealth_manifests.append(manifest_fpath) generate_manifest(dialect_dir, manifest_fpath) if is_merge: if len(american_canadian_manifests) > 0: manifest_fpath = manifest_prefix + '.american-canadian' merge_manifests(american_canadian_manifests, manifest_fpath) if len(commonwealth_manifests) > 0: manifest_fpath = manifest_prefix + '.commonwealth' merge_manifests(commonwealth_manifests, manifest_fpath) def main(): if args.target_dir.startswith('~'): args.target_dir = os.path.expanduser(args.target_dir) prepare_dataset(DATA_URL, args.dialects, args.target_dir, args.manifest_prefix, args.is_merge_dialect) if __name__ == '__main__': main()