From abbfa43b22d19b990df9a239fee5a4fbdd06b996 Mon Sep 17 00:00:00 2001
From: yangyaming <mxscmxsc@gmail.com>
Date: Thu, 16 Nov 2017 23:04:35 +0800
Subject: [PATCH] Add script for VoxForge data preparation.

---
 README.md                 |   2 +
 data/voxforge/run_data.sh |  18 ++++
 data/voxforge/voxforge.py | 221 ++++++++++++++++++++++++++++++++++++++
 data_utils/utility.py     |  19 ++++
 4 files changed, 260 insertions(+)
 create mode 100644 data/voxforge/run_data.sh
 create mode 100644 data/voxforge/voxforge.py

diff --git a/README.md b/README.md
index ca146926..6f282a28 100644
--- a/README.md
+++ b/README.md
@@ -506,6 +506,8 @@ VoxForge European       |   31.21           |   20.47
 VoxForge Indian         |   56.79           |   28.15
 Baidu Internal Testset  |   47.73           |   8.92
 
+To reproduce the results on VoxForge data, we provide a script that generates the VoxForge dialect manifest files. Go to ```data/voxforge``` and run ```sh run_data.sh``` to generate them. Note that the VoxForge data is continually updated, so the generated manifest files may differ from the ones we evaluated.
+
 #### Benchmark Results for Mandarin Model (Character Error Rate)
 
 Test Set                | Aishell Model     | BaiduCN1.2k Model
diff --git a/data/voxforge/run_data.sh b/data/voxforge/run_data.sh
new file mode 100644
index 00000000..e0a9f1b3
--- /dev/null
+++ b/data/voxforge/run_data.sh
@@ -0,0 +1,18 @@
+#! /usr/bin/env bash
+
+# Expects to be started from data/voxforge; stop if the repo root is unreachable.
+cd ../.. > /dev/null || exit 1
+
+# download data, generate manifests
+PYTHONPATH=.:$PYTHONPATH python data/voxforge/voxforge.py \
+--manifest_prefix='data/voxforge/manifest' \
+--target_dir='~/.cache/paddle/dataset/speech/VoxForge' \
+--is_merge_dialect=True \
+--dialects 'american' 'british' 'australian' 'european' 'irish' 'canadian' 'indian'
+
+if [ $? -ne 0 ]; then
+    echo "Prepare VoxForge failed. Terminated."
+    exit 1
+fi
+echo "VoxForge Data preparation done."
+exit 0
diff --git a/data/voxforge/voxforge.py b/data/voxforge/voxforge.py
new file mode 100644
index 00000000..63f052bd
--- /dev/null
+++ b/data/voxforge/voxforge.py
@@ -0,0 +1,221 @@
+"""Prepare VoxForge dataset
+
+Download, unpack and create manifest files.
+Manifest file is a json-format file with each line containing the
+meta data (i.e. audio filepath, transcript and audio duration)
+of each audio file in the data set.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import codecs
+import soundfile
+import json
+import argparse
+import shutil
+import subprocess
+from data_utils.utility import download_multi, unpack, getfile_insensitive
+
+DATA_HOME = '~/.cache/paddle/dataset/speech'
+
+DATA_URL = 'http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/' \
+           'Audio/Main/16kHz_16bit'
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    "--target_dir",
+    default=DATA_HOME + "/VoxForge",
+    type=str,
+    help="Directory to save the dataset. (default: %(default)s)")
+parser.add_argument(
+    "--dialects",
+    default=['american', 'british', 'australian', 'european', 'irish',
+             'canadian', 'indian'],
+    nargs='+',
+    type=str,
+    help="Dialect types. (default: %(default)s)")
+parser.add_argument(
+    "--is_merge_dialect",
+    default=True,
+    # type=bool would turn every non-empty string, "False" included, into
+    # True, so the text of the flag is interpreted explicitly instead.
+    type=lambda text: text.lower() not in ('', '0', 'n', 'no', 'f', 'false'),
+    help="If set True, manifests of american dialect and canadian dialect will "
+    "be merged to american-canadian dialect; manifests of british "
+    "dialect, irish dialect and australian dialect will be merged to "
+    "commonwealth dialect. (default: %(default)s)")
+parser.add_argument(
+    "--manifest_prefix",
+    default="manifest",
+    type=str,
+    help="Filepath prefix for output manifests. (default: %(default)s)")
+args = parser.parse_args()
+
+
+def download_and_unpack(target_dir, url):
+    """Download .tgz files from url into target_dir/tgz and unpack each one."""
+    tgz_dir, audio_dir = (os.path.join(target_dir, d) for d in ('tgz', 'audio'))
+    exit_code = download_multi(
+        url, tgz_dir, '-q -l 1 -N -nd -c -e robots=off -A tgz -r -np')
+    if exit_code != 0:
+        print('Download tgz audio files failed with exit code %d.' % exit_code)
+        return
+    print('Download done, start unpacking ...')
+    for root, _, filenames in os.walk(tgz_dir):
+        for filename in filenames:
+            print(filename)  # every downloaded name is echoed, archive or not
+            if filename.endswith('.tgz'):
+                unpack(os.path.join(root, filename), audio_dir)
+
+
+def select_dialects(target_dir, dialect_list):
+    """Classify audio directories by dialect via symlinks under target_dir."""
+    dialect_root_dir = os.path.join(target_dir, 'dialect')
+    if os.path.exists(dialect_root_dir):
+        shutil.rmtree(dialect_root_dir)
+    os.mkdir(dialect_root_dir)
+    audio_dir = os.path.abspath(os.path.join(target_dir, 'audio'))
+    for dialect in dialect_list:
+        # Argument list, no shell: the path and dialect are passed verbatim.
+        pattern = 'pronunciation dialect.*%s' % dialect
+        command = ['find', audio_dir, '-iwholename', '*etc/readme*',
+                   '-exec', 'egrep', '-iHl', pattern, '{}', ';']
+        p = subprocess.Popen(command, stdout=subprocess.PIPE)
+        output = p.communicate()[0]
+        dialect_dir = os.path.join(dialect_root_dir, dialect)
+        if os.path.exists(dialect_dir):
+            shutil.rmtree(dialect_dir)
+        os.mkdir(dialect_dir)
+        for path in output.splitlines():
+            src_dir = os.path.dirname(os.path.dirname(path))
+            link = os.path.basename(os.path.normpath(src_dir))
+            os.symlink(src_dir, os.path.join(dialect_dir, link))
+
+
+def generate_manifest(data_dir, manifest_path):
+    """Write a json-lines manifest for the audio dirs symlinked in data_dir.
+
+    Directories or utterances that fail a check are reported and skipped.
+    """
+    json_lines = []
+
+    for path in os.listdir(data_dir):
+        audio_link = os.path.join(data_dir, path)
+        assert os.path.islink(
+            audio_link), '%s should be symbolic link.' % audio_link
+        actual_audio_dir = os.path.abspath(os.readlink(audio_link))
+
+        if os.path.isdir(os.path.join(actual_audio_dir, 'wav')):
+            audio_type = 'wav'
+        elif os.path.isdir(os.path.join(actual_audio_dir, 'flac')):
+            audio_type = 'flac'
+        else:
+            print('Unknown audio type, skipped processing %s.' %
+                  actual_audio_dir)
+            continue
+
+        etc_dir = os.path.join(actual_audio_dir, 'etc')
+        prompts_file = os.path.join(etc_dir, 'PROMPTS')
+        if not os.path.isfile(prompts_file):
+            print('PROMPTS file missing, skip processing %s.' %
+                  actual_audio_dir)
+            continue
+
+        readme_file = getfile_insensitive(os.path.join(etc_dir, 'README'))
+        if readme_file is None:
+            print('README file missing, skip processing %s.' % actual_audio_dir)
+            continue
+
+        # open() replaces the Python-2-only file(); 'with' closes the handle.
+        with open(prompts_file) as prompts:
+            prompt_lines = prompts.readlines()
+        for line in prompt_lines:
+            fields = line.strip().split(None, 1)
+            if len(fields) != 2:
+                continue  # blank line, or utterance id without a transcript
+            u, trans = fields
+            u_parts = u.split('/')
+
+            # Only the file name of the id is used; look under <audio_type>/.
+            if len(u_parts) < 2:
+                u_parts = [audio_type] + u_parts
+            u_parts[-2] = audio_type
+            u_parts[-1] += '.' + audio_type
+            u = os.path.join(actual_audio_dir, '/'.join(u_parts[-2:]))
+
+            if not os.path.isfile(u):
+                print('Audio file missing, skip processing %s.' % u)
+                continue
+
+            if os.stat(u).st_size == 0:
+                print('Empty audio file, skip processing %s.' % u)
+                continue
+
+            trans = trans.strip().replace('-', ' ')
+            if not trans.isupper() or \
+                not trans.strip().replace(' ', '').replace("'", "").isalpha():
+                print("Transcript not normalized properly, skip processing %s."
+                      % u)
+                continue
+
+            audio_data, samplerate = soundfile.read(u)
+            duration = float(len(audio_data)) / samplerate
+            json_lines.append(
+                json.dumps({
+                    'audio_filepath': u,
+                    'duration': duration,
+                    'text': trans.lower()
+                }))
+
+    with codecs.open(manifest_path, 'w', 'utf-8') as fout:
+        fout.writelines(line + '\n' for line in json_lines)
+
+
+def merge_manifests(manifest_files, save_path):
+    """Concatenate manifest_files, in order, into one manifest at save_path."""
+    merged = []
+    for manifest_file in manifest_files:
+        # Read under 'with' so every input handle is closed, not leaked.
+        with codecs.open(manifest_file, 'r', 'utf-8') as fin:
+            merged.extend(fin.readlines())
+    with codecs.open(save_path, 'w', 'utf-8') as fout:
+        fout.writelines(merged)
+
+
+def prepare_dataset(url, dialects, target_dir, manifest_prefix, is_merge):
+    """Download the data, group it by dialect and write the manifest files."""
+    download_and_unpack(target_dir, url)
+    select_dialects(target_dir, dialects)
+    # Suffix of each merged manifest and the dialects that feed into it.
+    groups = [('american-canadian', ('american', 'canadian')),
+              ('commonwealth', ('australian', 'british', 'irish'))]
+    merged_inputs = dict((suffix, []) for suffix, _ in groups)
+    for dialect in dialects:
+        manifest_fpath = manifest_prefix + '.' + dialect
+        for suffix, members in groups:
+            if dialect in members:
+                merged_inputs[suffix].append(manifest_fpath)
+        generate_manifest(
+            os.path.join(target_dir, 'dialect', dialect), manifest_fpath)
+
+    # Merged manifests are only written on request and for non-empty groups.
+    if not is_merge:
+        return
+    for suffix, _ in groups:
+        if merged_inputs[suffix]:
+            merge_manifests(merged_inputs[suffix],
+                            manifest_prefix + '.' + suffix)
+
+
+def main():
+    """Expand '~' in --target_dir, then prepare the VoxForge dataset."""
+    # expanduser() returns paths without a leading '~' unchanged.
+    args.target_dir = os.path.expanduser(args.target_dir)
+    prepare_dataset(DATA_URL, args.dialects, args.target_dir,
+                    args.manifest_prefix, args.is_merge_dialect)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/data_utils/utility.py b/data_utils/utility.py
index bb5cad45..2633e1b4 100644
--- a/data_utils/utility.py
+++ b/data_utils/utility.py
@@ -42,6 +42,25 @@ def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
     return manifest
 
 
+def getfile_insensitive(path):
+    """Return the file whose name matches path's, ignoring case, or None."""
+    head, tail = os.path.split(path)
+    directory, wanted = head or '.', tail.lower()
+    candidates = (os.path.join(directory, entry)
+                  for entry in os.listdir(directory)
+                  if entry.lower() == wanted)
+    return next((c for c in candidates if os.path.isfile(c)), None)
+
+
+def download_multi(url, target_dir, extra_args):
+    """Run `wget -c` on url with extra_args, saving into target_dir, and
+    return the status value that os.system reports for the command."""
+    if not os.path.exists(target_dir):
+        os.makedirs(target_dir)
+    print("Downloading %s ..." % url)
+    return os.system(' '.join(["wget -c", url, extra_args, "-P", target_dir]))
+
+
 def download(url, md5sum, target_dir):
     """Download file from url to target_dir, and check md5sum."""
     if not os.path.exists(target_dir): os.makedirs(target_dir)