Add files via upload

I copied the data_utils folder into data/librispeech so that librispeech.py runs properly and avoids the error where numpy does not see the data_utils.utility module, from which download and unpack are then imported. It works now and the download runs.
5 years ago · a679daece9
parent b3c728d46f
commit a679daece9
4 changed files with 158 additions and 0 deletions
--- a/data/librispeech/data_utils/data.pyc
+++ b/data/librispeech/data_utils/data.pyc
--- a/data/librispeech/data_utils/featurizer/text_featurizer.py
+++ b/data/librispeech/data_utils/featurizer/text_featurizer.py
@ -0,0 +1,68 @@
 """Contains the text featurizer class."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 import os
 import codecs
 class TextFeaturizer(object):
    """Text featurizer, for processing or extracting features from text.

    Currently, it only supports char-level tokenizing and conversion into
    a list of token indices. Note that the token indexing order follows the
    given vocabulary file.

    :param vocab_filepath: Filepath to load vocabulary for token indices
                           conversion.
    :type vocab_filepath: basestring
    """

    def __init__(self, vocab_filepath):
        self._vocab_dict, self._vocab_list = self._load_vocabulary_from_file(
            vocab_filepath)

    def featurize(self, text):
        """Convert text string to a list of token indices in char-level.

        Note that the token indexing order follows the given vocabulary
        file. Leading and trailing whitespace of the text is stripped before
        tokenizing.

        :param text: Text to process.
        :type text: basestring
        :return: List of char-level token indices.
        :rtype: list
        :raises KeyError: If a character of the text is not in the
                          vocabulary.
        """
        tokens = self._char_tokenize(text)
        return [self._vocab_dict[token] for token in tokens]

    @property
    def vocab_size(self):
        """Return the vocabulary size.

        :return: Vocabulary size.
        :rtype: int
        """
        return len(self._vocab_list)

    @property
    def vocab_list(self):
        """Return the vocabulary in list.

        :return: Vocabulary in list.
        :rtype: list
        """
        return self._vocab_list

    def _char_tokenize(self, text):
        """Split the whitespace-stripped text into a list of characters."""
        return list(text.strip())

    def _load_vocabulary_from_file(self, vocab_filepath):
        """Load vocabulary from a UTF-8 file holding one token per line.

        :param vocab_filepath: Filepath of the vocabulary file.
        :type vocab_filepath: basestring
        :return: Tuple of (token -> index dict, list of tokens), where the
                 index of a token is its zero-based line number.
        :rtype: tuple
        """
        with codecs.open(vocab_filepath, 'r', 'utf-8') as vocab_file:
            # splitlines() removes exactly the line terminator, so a final
            # line without a trailing newline keeps its last character and
            # "\r\n" endings leave no stray "\r" in the token. Tokens that
            # are themselves whitespace (e.g. a single space) are preserved.
            vocab_list = vocab_file.read().splitlines()
        vocab_dict = {token: index for index, token in enumerate(vocab_list)}
        return vocab_dict, vocab_list
--- a/data/librispeech/data_utils/normalizer.pyc
+++ b/data/librispeech/data_utils/normalizer.pyc
--- a/data/librispeech/data_utils/utility.py
+++ b/data/librispeech/data_utils/utility.py
@ -0,0 +1,90 @@
 """Contains data helper functions."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 import json
 import codecs
 import os
 import tarfile
 import time
 from Queue import Queue
 from threading import Thread
 from multiprocessing import Process, Manager, Value
 from paddle.dataset.common import md5file
 def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
    """Load and parse manifest file.

    The manifest is read as UTF-8 text with one JSON object per line.
    Instances with durations outside [min_duration, max_duration] will be
    filtered out.

    :param manifest_path: Manifest file to load and parse.
    :type manifest_path: basestring
    :param max_duration: Maximal duration in seconds for instance filter.
    :type max_duration: float
    :param min_duration: Minimal duration in seconds for instance filter.
    :type min_duration: float
    :return: Manifest parsing results. List of dict.
    :rtype: list
    :raises IOError: If failed to parse the manifest.
    """
    manifest = []
    # Use a context manager so the file handle is released even when a
    # malformed line aborts the loop with an exception.
    with codecs.open(manifest_path, 'r', 'utf-8') as manifest_file:
        for json_line in manifest_file:
            try:
                json_data = json.loads(json_line)
            except ValueError as e:
                # json.loads reports malformed text as ValueError.
                raise IOError("Error reading manifest: %s" % str(e))
            if min_duration <= json_data["duration"] <= max_duration:
                manifest.append(json_data)
    return manifest
 def getfile_insensitive(path):
    """Resolve a path whose filename may differ in letter case.

    The directory part of the path is used as given (the current directory
    when it is empty); only the final filename component is compared, in
    lowercase, against the directory entries.

    :param path: Path whose filename should be matched case-insensitively.
    :type path: basestring
    :return: Path of the first regular file in the directory whose
             lowercased name equals the lowercased filename, or None when
             there is no such file.
    """
    parent, wanted = os.path.split(path)
    if not parent:
        parent = '.'
    wanted = wanted.lower()
    for entry in os.listdir(parent):
        if entry.lower() != wanted:
            continue
        candidate = os.path.join(parent, entry)
        # Directories and other non-regular entries never count as a match.
        if os.path.isfile(candidate):
            return candidate
    return None
 def download_multi(url, target_dir, extra_args):
    """Download multiple files from url to target_dir.

    Creates target_dir when it does not exist, then runs ``wget -c`` through
    the shell with the given extra arguments and ``-P target_dir``.

    :param url: Url handed to wget.
    :type url: basestring
    :param target_dir: Directory passed to wget's -P option.
    :type target_dir: basestring
    :param extra_args: Additional command line text inserted after the url.
    :type extra_args: basestring
    :return: The value returned by os.system for the wget command.
    """
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    print("Downloading %s ..." % url)
    command = " ".join(["wget -c", url, extra_args, "-P", target_dir])
    return os.system(command)
 def download(url, md5sum, target_dir):
    """Download file from url to target_dir, and check md5sum.

    The local filename is the last "/"-separated component of the url. When
    that file already exists in target_dir with the expected md5sum, the
    download is skipped; otherwise ``wget -c`` is run through the shell.

    :param url: Url of the file to download.
    :type url: basestring
    :param md5sum: Expected md5 checksum, compared with md5file's result.
    :type md5sum: basestring
    :param target_dir: Directory to store the file in (created if missing).
    :type target_dir: basestring
    :return: Path of the downloaded (or already present) file.
    :rtype: basestring
    :raises RuntimeError: If the file is missing after the download attempt
                          or its md5 checksum does not match.
    """
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    filepath = os.path.join(target_dir, url.split("/")[-1])
    if os.path.exists(filepath) and md5file(filepath) == md5sum:
        print("File exists, skip downloading. (%s)" % filepath)
        return filepath
    print("Downloading %s ..." % url)
    os.system("wget -c " + url + " -P " + target_dir)
    # wget's exit status is not inspected; instead make sure it produced the
    # file, so a failed download is reported as such rather than as an
    # unrelated error from checksumming a nonexistent path.
    if not os.path.exists(filepath):
        raise RuntimeError("Download failed, file not found: %s" % filepath)
    print("\nMD5 Checksum %s ..." % filepath)
    if md5file(filepath) != md5sum:
        raise RuntimeError("MD5 checksum failed.")
    return filepath
 def unpack(filepath, target_dir, rm_tar=False):
    """Unpack the file to the target_dir.

    :param filepath: Path of the tar archive to extract.
    :type filepath: basestring
    :param target_dir: Directory to extract the archive members into.
    :type target_dir: basestring
    :param rm_tar: Delete the archive after a successful extraction when
                   True.
    :type rm_tar: bool
    """
    print("Unpacking %s ..." % filepath)
    # The context manager closes the archive even if extraction fails.
    # NOTE(review): extractall() trusts member paths inside the archive;
    # only use this on archives from a trusted source.
    with tarfile.open(filepath) as tar:
        tar.extractall(target_dir)
    # Deliberately kept as an equality test: truthy non-boolean values such
    # as a non-empty string must not trigger deletion of the archive.
    if rm_tar == True:
        os.remove(filepath)
 class XmapEndSignal():
    """Empty marker class with no attributes or behavior of its own.

    NOTE(review): judging by the name, this is presumably used as an
    end-of-data sentinel by an xmap-style reader; the code that consumes it
    is not visible in this section, so confirm against its usage.
    """
    pass