Just copied the folder and pasted it into data/librispeech to get librispeech.py to run properly and avoid the error of numpy not finding the data_utils.utility module when importing download and unpack. It works now and the download completes.
pull/499/head
parent b3c728d46f
commit a679daece9
Binary file not shown.
@@ -0,0 +1,68 @@
"""Contains the text featurizer class."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import codecs


class TextFeaturizer(object):
    """Text featurizer, for processing or extracting features from text.

    Currently, it only supports char-level tokenizing and conversion into
    a list of token indices. Note that the token indexing order follows the
    given vocabulary file.

    :param vocab_filepath: Filepath to load vocabulary for token indices
                           conversion.
    :type vocab_filepath: basestring
    """

    def __init__(self, vocab_filepath):
        self._vocab_dict, self._vocab_list = self._load_vocabulary_from_file(
            vocab_filepath)

    def featurize(self, text):
        """Convert text string to a list of char-level token indices. Note
        that the token indexing order follows the given vocabulary file.

        :param text: Text to process.
        :type text: basestring
        :return: List of char-level token indices.
        :rtype: list
        """
        tokens = self._char_tokenize(text)
        return [self._vocab_dict[token] for token in tokens]

    @property
    def vocab_size(self):
        """Return the vocabulary size.

        :return: Vocabulary size.
        :rtype: int
        """
        return len(self._vocab_list)

    @property
    def vocab_list(self):
        """Return the vocabulary as a list.

        :return: Vocabulary as a list.
        :rtype: list
        """
        return self._vocab_list

    def _char_tokenize(self, text):
        """Character tokenizer."""
        return list(text.strip())

    def _load_vocabulary_from_file(self, vocab_filepath):
        """Load vocabulary from file."""
        vocab_lines = []
        with codecs.open(vocab_filepath, 'r', 'utf-8') as file:
            vocab_lines.extend(file.readlines())
        vocab_list = [line[:-1] for line in vocab_lines]
        vocab_dict = dict(
            [(token, id) for (id, token) in enumerate(vocab_list)])
        return vocab_dict, vocab_list
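A minimal usage sketch of the featurizer API above (the vocab file name and its contents are hypothetical, for illustration only; any one-token-per-line file works, with line order defining the token indices):

# Suppose a hypothetical vocab.txt contains three lines: a, b, c.
featurizer = TextFeaturizer('vocab.txt')
print(featurizer.vocab_size)        # -> 3
print(featurizer.vocab_list)        # -> [u'a', u'b', u'c']
print(featurizer.featurize('cab'))  # -> [2, 0, 1]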
Binary file not shown.
@@ -0,0 +1,90 @@
"""Contains data helper functions."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import json
import codecs
import os
import tarfile
import time
from Queue import Queue
from threading import Thread
from multiprocessing import Process, Manager, Value
from paddle.dataset.common import md5file


def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
    """Load and parse manifest file.

    Instances with durations outside [min_duration, max_duration] will be
    filtered out.

    :param manifest_path: Manifest file to load and parse.
    :type manifest_path: basestring
    :param max_duration: Maximal duration in seconds for instance filter.
    :type max_duration: float
    :param min_duration: Minimal duration in seconds for instance filter.
    :type min_duration: float
    :return: Manifest parsing results. List of dict.
    :rtype: list
    :raises IOError: If failed to parse the manifest.
    """
    manifest = []
    for json_line in codecs.open(manifest_path, 'r', 'utf-8'):
        try:
            json_data = json.loads(json_line)
        except Exception as e:
            raise IOError("Error reading manifest: %s" % str(e))
        if (json_data["duration"] <= max_duration and
                json_data["duration"] >= min_duration):
            manifest.append(json_data)
    return manifest
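A sketch of the manifest format read_manifest expects, inferred from the code above: one standalone JSON object per line, each carrying at least a "duration" field (file name and values here are hypothetical):

# Hypothetical manifest.jsonl, one JSON object per line:
#   {"audio_filepath": "a.wav", "duration": 3.2, "text": "hello"}
#   {"audio_filepath": "b.wav", "duration": 27.5, "text": "long clip"}
entries = read_manifest('manifest.jsonl', max_duration=20.0, min_duration=0.5)
print(len(entries))  # -> 1; the 27.5 s clip falls outside [0.5, 20.0]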


def getfile_insensitive(path):
    """Get the actual file path for a case-insensitive filename."""
    directory, filename = os.path.split(path)
    directory, filename = (directory or '.'), filename.lower()
    for f in os.listdir(directory):
        newpath = os.path.join(directory, f)
        if os.path.isfile(newpath) and f.lower() == filename:
            return newpath


def download_multi(url, target_dir, extra_args):
    """Download multiple files from url to target_dir."""
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    print("Downloading %s ..." % url)
    ret_code = os.system("wget -c " + url + ' ' + extra_args + " -P " +
                         target_dir)
    return ret_code


def download(url, md5sum, target_dir):
    """Download file from url to target_dir, and check md5sum."""
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    filepath = os.path.join(target_dir, url.split("/")[-1])
    if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
        print("Downloading %s ..." % url)
        os.system("wget -c " + url + " -P " + target_dir)
        print("\nMD5 Checksum %s ..." % filepath)
        if md5file(filepath) != md5sum:
            raise RuntimeError("MD5 checksum failed.")
    else:
        print("File exists, skip downloading. (%s)" % filepath)
    return filepath


def unpack(filepath, target_dir, rm_tar=False):
    """Unpack the file to the target_dir."""
    print("Unpacking %s ..." % filepath)
    tar = tarfile.open(filepath)
    tar.extractall(target_dir)
    tar.close()
    if rm_tar:
        os.remove(filepath)
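download and unpack are the helpers librispeech.py imports from data_utils.utility, per the commit message above. A minimal sketch of the intended call sequence (the URL and md5 below are placeholders, not the real LibriSpeech values):

# Placeholder URL and md5, for illustration only; librispeech.py supplies
# the real values for each LibriSpeech subset.
url = 'http://example.com/dev-clean.tar.gz'
md5 = '0123456789abcdef0123456789abcdef'
filepath = download(url, md5, target_dir='data/librispeech')
unpack(filepath, target_dir='data/librispeech', rm_tar=False)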


class XmapEndSignal():
    pass