Merge pull request #702 from PaddlePaddle/dataset

add timit dataset
Hui Zhang 4 years ago committed by GitHub
commit fe0c3bb92d

@@ -5,6 +5,10 @@ set -e
curdir=$PWD
test -d GigaSpeech || git clone https://github.com/SpeechColab/GigaSpeech.git
-cd GigaSpeech
+pushd GigaSpeech
source env_vars.sh
-utils/gigaspeech_download.sh ${curdir}/
+./utils/download_gigaspeech.sh ${curdir}/
+#toolkits/kaldi/gigaspeech_data_prep.sh --train-subset XL /disk1/audio_data/gigaspeech ../data
+popd

@@ -117,20 +117,21 @@ def create_manifest(data_dir, manifest_path_prefix):
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
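# e.g. 16000 samples read at a 16000 Hz sample rate -> duration = 1.0 second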
# alignment info is not dumped into the manifest
json_lines.append(
json.dumps(
{
'utt': audio_id,
'feat': audio_path,
'feat_shape': (duration, ), # second
-'text': word_text,
+'text': word_text,  # character
'syllable': syllable_text,
'phone': phone_text,
},
ensure_ascii=False))
total_sec += duration
-total_text += len(text)
+total_text += len(word_text)
total_num += 1
manifest_path = manifest_path_prefix + '.' + dtype

@@ -0,0 +1,4 @@
TIMIT.*
TIMIT
manifest.*
*.meta

@@ -0,0 +1,239 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Prepare Librispeech ASR datasets.
Download, unpack and create manifest files.
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
import argparse
import codecs
import json
import os
import re
import string
from pathlib import Path
import soundfile
from utils.utility import check_md5sum
from utils.utility import unzip
URL_ROOT = ""
MD5_DATA = "45c68037c7fdfe063a43c851f181fb2d"
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--target_dir",
default='~/.cache/paddle/dataset/speech/timit',
type=str,
help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
"--manifest_prefix",
default="manifest",
type=str,
help="Filepath prefix for output manifests. (default: %(default)s)")
args = parser.parse_args()
#: A string containing Chinese punctuation marks (non-stops).
non_stops = (
# Fullwidth ASCII variants
'\uFF02\uFF03\uFF04\uFF05\uFF06\uFF07\uFF08\uFF09\uFF0A\uFF0B\uFF0C\uFF0D'
'\uFF0F\uFF1A\uFF1B\uFF1C\uFF1D\uFF1E\uFF20\uFF3B\uFF3C\uFF3D\uFF3E\uFF3F'
'\uFF40\uFF5B\uFF5C\uFF5D\uFF5E\uFF5F\uFF60'
# Halfwidth CJK punctuation
'\uFF62\uFF63\uFF64'
# CJK symbols and punctuation
'\u3000\u3001\u3003'
# CJK angle and corner brackets
'\u3008\u3009\u300A\u300B\u300C\u300D\u300E\u300F\u3010\u3011'
# CJK brackets and symbols/punctuation
'\u3014\u3015\u3016\u3017\u3018\u3019\u301A\u301B\u301C\u301D\u301E\u301F'
# Other CJK symbols
'\u3030'
# Special CJK indicators
'\u303E\u303F'
# Dashes
'\u2013\u2014'
# Quotation marks and apostrophe
'\u2018\u2019\u201B\u201C\u201D\u201E\u201F'
# General punctuation
'\u2026\u2027'
# Overscores and underscores
'\uFE4F'
# Small form variants
'\uFE51\uFE54'
# Latin punctuation
'\u00B7')
#: A string of Chinese stops.
stops = (
'\uFF01' # Fullwidth exclamation mark
'\uFF1F' # Fullwidth question mark
'\uFF61' # Halfwidth ideographic full stop
'\u3002' # Ideographic full stop
)
#: A string containing all Chinese punctuation.
punctuation = non_stops + stops
def tn(text):
# text normalization: lowercase, then remove ASCII and Chinese punctuation
text = text.lower()
text = re.sub(f'[{punctuation}{string.punctuation}]', "", text)
return text
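# illustrative example: tn('Hello, World! 你好，世界。') -> 'hello world 你好世界'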
def read_txt(filepath: str) -> str:
with open(filepath, 'r') as f:
line = f.read().strip().split(maxsplit=2)[2]
return tn(line)
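# illustrative example (sample numbers hypothetical): a .TXT line reads
#   '0 46797 She had your dark suit in greasy wash water all year.'
# -> read_txt(...) returns 'she had your dark suit in greasy wash water all year'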
def read_align(filepath: str) -> str:
"""Read a word or phone alignment file with lines of the form:
<start-sample> <end-sample> <token><newline>
Args:
filepath (str): path to the .WRD or .PHN alignment file.
Returns:
str: tokens separated by <space>.
"""
aligns = [] # (start, end, token)
with open(filepath, 'r') as f:
for line in f:
items = line.strip().split()
# beginning and ending silence regions are marked with h# in phone files; skip them
if items[2].strip() == 'h#':
continue
aligns.append(items)
return ' '.join([item[2] for item in aligns])
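# illustrative example (hypothetical .PHN content):
#   0 3050 h#
#   3050 4559 sh
#   4559 5723 ix
# -> read_align(...) returns 'sh ix' (the h# silence markers are dropped)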
def create_manifest(data_dir, manifest_path_prefix):
"""Create a manifest json file summarizing the data set, with each line
containing the meta data (i.e. audio filepath, transcription text, audio
duration) of each audio file within the data set.
"""
print("Creating manifest %s ..." % manifest_path_prefix)
json_lines = []
utts = set()
data_types = ['TRAIN', 'TEST']
for dtype in data_types:
del json_lines[:]
total_sec = 0.0
total_text = 0.0
total_num = 0
audio_dir = Path(os.path.join(data_dir, dtype))
for fname in sorted(audio_dir.rglob('*.WAV')):
audio_path = fname.resolve() # .WAV
audio_id = audio_path.stem
# skip utterance ids we have already seen
if audio_id in utts:
continue
utts.add(audio_id)
text_path = audio_path.with_suffix('.TXT')
phone_path = audio_path.with_suffix('.PHN')
word_path = audio_path.with_suffix('.WRD')
audio_data, samplerate = soundfile.read(
str(audio_path), dtype='int16')
duration = float(len(audio_data) / samplerate)
word_text = read_txt(text_path)
phone_text = read_align(phone_path)
gender_spk = str(audio_path.parent.stem)
spk = gender_spk[1:]
gender = gender_spk[0]
utt_id = '_'.join([spk, gender, audio_id])
# alignment info is not dumped into the manifest
json_lines.append(
json.dumps(
{
'utt': utt_id,
'feat': str(audio_path),
'feat_shape': (duration, ), # second
'text': word_text, # word
'phone': phone_text,
'spk': spk,
'gender': gender,
},
ensure_ascii=False))
total_sec += duration
total_text += len(word_text.split())
total_num += 1
manifest_path = manifest_path_prefix + '.' + dtype.lower()
with codecs.open(manifest_path, 'w', 'utf-8') as fout:
for line in json_lines:
fout.write(line + '\n')
with open(dtype.lower() + '.meta', 'w') as f:
print(f"{dtype}:", file=f)
print(f"{total_num} utts", file=f)
print(f"{total_sec / (60*60)} h", file=f)
print(f"{total_text} text", file=f)
print(f"{total_text / total_sec} text/sec", file=f)
print(f"{total_sec / total_num} sec/utt", file=f)
def prepare_dataset(url, md5sum, target_dir, manifest_path):
"""Download, unpack and create summmary manifest file.
"""
filepath = os.path.join(target_dir, "TIMIT.zip")
if not os.path.exists(filepath):
print(f"Please download TIMIT.zip into {target_dir}.")
raise FileNotFoundError
if not os.path.exists(os.path.join(target_dir, "TIMIT")):
# check md5sum
assert check_md5sum(filepath, md5sum)
# unpack
unzip(filepath, target_dir)
else:
print("Skip downloading and unpacking. Data already exists in %s." %
target_dir)
# create manifest json file
create_manifest(os.path.join(target_dir, "TIMIT"), manifest_path)
def main():
if args.target_dir.startswith('~'):
args.target_dir = os.path.expanduser(args.target_dir)
prepare_dataset(URL_ROOT, MD5_DATA, args.target_dir, args.manifest_prefix)
print("Data download and manifest prepare done!")
if __name__ == '__main__':
main()
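For reference, a minimal usage sketch (the script name, paths, duration and phone values below are hypothetical; TIMIT is licensed, so TIMIT.zip must be obtained and placed in the target dir manually):

python3 timit.py --target_dir=~/.cache/paddle/dataset/speech/timit --manifest_prefix=manifest

# each line of manifest.train / manifest.test is one JSON object, e.g. for
# utterance SA1 of speaker FCJF0 (spk 'CJF0', gender 'F'); the feat_shape
# tuple (duration,) serializes as a one-element list of seconds:
{"utt": "CJF0_F_SA1", "feat": "/path/to/TIMIT/TRAIN/DR1/FCJF0/SA1.WAV", "feat_shape": [3.1], "text": "she had your dark suit in greasy wash water all year", "phone": "sh ix hv eh ...", "spk": "CJF0", "gender": "F"}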

@@ -14,9 +14,15 @@
import os
import tarfile
import zipfile
from typing import Text
from paddle.dataset.common import md5file
__all__ = [
"check_md5sum", "getfile_insensitive", "download_multi", "download",
"unpack", "unzip"
]
def getfile_insensitive(path):
"""Get the actual file path when given insensitive filename."""
@@ -54,6 +60,19 @@ def download(url, md5sum, target_dir):
return filepath
def check_md5sum(filepath: Text, md5sum: Text) -> bool:
"""Check the md5sum of a file against an expected digest.
Args:
filepath (Text): path to the file to check.
md5sum (Text): expected md5 hex digest.
Returns:
bool: whether the digests match.
"""
return md5file(filepath) == md5sum
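# illustrative example: check_md5sum('/path/to/TIMIT.zip', '45c68037c7fdfe063a43c851f181fb2d')
# returns True only when the local archive matches the expected digest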
def unpack(filepath, target_dir, rm_tar=False):
"""Unpack the file to the target_dir."""
print("Unpacking %s ..." % filepath)
