Copied the folder into data/librispeech so that librispeech.py runs properly and avoids the error of numpy not seeing the data_utils.utility module, from which download and unpack are imported. It works now and the download proceeds. pull/499/head
parent
b3c728d46f
commit
a679daece9
Binary file not shown.
@ -0,0 +1,68 @@
|
|||||||
|
"""Contains the text featurizer class."""
|
||||||
|
from __future__ import absolute_import
|
||||||
|
from __future__ import division
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
|
import os
|
||||||
|
import codecs
|
||||||
|
|
||||||
|
|
||||||
|
class TextFeaturizer(object):
    """Text featurizer, for processing or extracting features from text.

    Currently, it only supports char-level tokenizing and conversion into
    a list of token indices. Note that the token indexing order follows the
    given vocabulary file.

    :param vocab_filepath: Filepath to load vocabulary for token indices
                           conversion.
    :type vocab_filepath: basestring
    """

    def __init__(self, vocab_filepath):
        self._vocab_dict, self._vocab_list = self._load_vocabulary_from_file(
            vocab_filepath)

    def featurize(self, text):
        """Convert text string to a list of token indices in char-level. Note
        that the token indexing order follows the given vocabulary file.

        Leading and trailing whitespace of ``text`` is stripped before
        tokenizing.

        :param text: Text to process.
        :type text: basestring
        :return: List of char-level token indices.
        :rtype: list
        :raises KeyError: If a character of the text is not in the vocabulary.
        """
        tokens = self._char_tokenize(text)
        return [self._vocab_dict[token] for token in tokens]

    @property
    def vocab_size(self):
        """Return the vocabulary size.

        :return: Vocabulary size.
        :rtype: int
        """
        return len(self._vocab_list)

    @property
    def vocab_list(self):
        """Return the vocabulary in list.

        :return: Vocabulary in list.
        :rtype: list
        """
        return self._vocab_list

    def _char_tokenize(self, text):
        """Character tokenizer: split the stripped text into characters."""
        return list(text.strip())

    def _load_vocabulary_from_file(self, vocab_filepath):
        """Load vocabulary from file, one token per line.

        :param vocab_filepath: Path of the UTF-8 encoded vocabulary file.
        :type vocab_filepath: basestring
        :return: Tuple of (token -> index dict, list of tokens in file order).
        :rtype: tuple
        """
        with codecs.open(vocab_filepath, 'r', 'utf-8') as vocab_file:
            # splitlines() removes the line terminator itself instead of
            # blindly dropping the last character, so the final token is kept
            # intact when the file has no trailing newline and CRLF files do
            # not leave a stray '\r' on every token.
            vocab_list = vocab_file.read().splitlines()
        # If a token is duplicated, the last occurrence determines its index.
        vocab_dict = {token: index for index, token in enumerate(vocab_list)}
        return vocab_dict, vocab_list
|
Binary file not shown.
@ -0,0 +1,90 @@
|
|||||||
|
"""Contains data helper functions."""
|
||||||
|
from __future__ import absolute_import
|
||||||
|
from __future__ import division
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
|
import json
|
||||||
|
import codecs
|
||||||
|
import os
|
||||||
|
import tarfile
|
||||||
|
import time
|
||||||
|
from Queue import Queue
|
||||||
|
from threading import Thread
|
||||||
|
from multiprocessing import Process, Manager, Value
|
||||||
|
from paddle.dataset.common import md5file
|
||||||
|
|
||||||
|
|
||||||
|
def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
    """Load and parse manifest file.

    Each line of the manifest is parsed as one JSON object. Instances with
    durations outside [min_duration, max_duration] will be filtered out
    (both bounds are inclusive).

    :param manifest_path: Manifest file to load and parse.
    :type manifest_path: basestring
    :param max_duration: Maximal duration in seconds for instance filter.
    :type max_duration: float
    :param min_duration: Minimal duration in seconds for instance filter.
    :type min_duration: float
    :return: Manifest parsing results. List of dict.
    :rtype: list
    :raises IOError: If failed to parse the manifest.
    """
    manifest = []
    # Open through a context manager so the file handle is released even when
    # a malformed line aborts the loop.
    with codecs.open(manifest_path, 'r', 'utf-8') as manifest_file:
        for json_line in manifest_file:
            try:
                json_data = json.loads(json_line)
            except Exception as e:
                raise IOError("Error reading manifest: %s" % str(e))
            if min_duration <= json_data["duration"] <= max_duration:
                manifest.append(json_data)
    return manifest
|
||||||
|
|
||||||
|
|
||||||
|
def getfile_insensitive(path):
    """Get the actual file path when given insensitive filename.

    The directory part of ``path`` is used as given (the current directory
    when empty); only the final component is matched case-insensitively
    against the regular files in that directory.

    :param path: Path whose filename may differ in case from the real file.
    :type path: basestring
    :return: Path of the first matching regular file, or None if no entry
             matches.
    :rtype: basestring|None
    """
    parent, basename = os.path.split(path)
    parent = parent or '.'
    wanted = basename.lower()
    matches = (os.path.join(parent, entry)
               for entry in os.listdir(parent)
               if entry.lower() == wanted)
    # Directories and other non-file entries never count as a match.
    return next((match for match in matches if os.path.isfile(match)), None)
|
||||||
|
|
||||||
|
|
||||||
|
def download_multi(url, target_dir, extra_args):
    """Download multiple files from url to target_dir.

    :param url: URL handed to ``wget``.
    :type url: basestring
    :param target_dir: Directory to download into; created if missing.
    :type target_dir: basestring
    :param extra_args: Additional command-line arguments for ``wget``,
                       given as a single string.
    :type extra_args: basestring
    :return: The value returned by ``os.system`` for the wget command.
    :rtype: int
    """
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    print("Downloading %s ..." % url)
    # NOTE: the command runs through the shell and none of the arguments are
    # quoted, so they must not contain shell metacharacters.
    command = " ".join(["wget -c", url, extra_args, "-P", target_dir])
    return os.system(command)
|
||||||
|
|
||||||
|
|
||||||
|
def download(url, md5sum, target_dir):
    """Download file from url to target_dir, and check md5sum.

    The download is skipped when a file with the expected name already
    exists in ``target_dir`` and its MD5 matches ``md5sum``.

    :param url: URL of the file; its last path component is the filename.
    :type url: basestring
    :param md5sum: Expected MD5 checksum of the file.
    :type md5sum: basestring
    :param target_dir: Directory to download into; created if missing.
    :type target_dir: basestring
    :return: Path of the local file.
    :rtype: basestring
    :raises RuntimeError: If the checksum of the downloaded file does not
                          match ``md5sum``.
    """
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    filename = url.rsplit("/", 1)[-1]
    filepath = os.path.join(target_dir, filename)

    # Fast path: a verified copy is already on disk.
    if os.path.exists(filepath) and md5file(filepath) == md5sum:
        print("File exists, skip downloading. (%s)" % filepath)
        return filepath

    print("Downloading %s ..." % url)
    os.system("wget -c " + url + " -P " + target_dir)
    print("\nMD5 Chesksum %s ..." % filepath)
    if md5file(filepath) != md5sum:
        raise RuntimeError("MD5 checksum failed.")
    return filepath
|
||||||
|
|
||||||
|
|
||||||
|
def unpack(filepath, target_dir, rm_tar=False):
    """Unpack the file to the target_dir.

    :param filepath: Path of the tar archive to extract.
    :type filepath: basestring
    :param target_dir: Directory to extract the archive into.
    :type target_dir: basestring
    :param rm_tar: Whether to delete the archive after extraction.
    :type rm_tar: bool
    """
    print("Unpacking %s ..." % filepath)
    # The context manager closes the archive even if extraction fails, so the
    # file handle is never leaked.
    # NOTE(review): extractall() trusts member paths inside the archive; only
    # use this on archives from a trusted source.
    with tarfile.open(filepath) as tar:
        tar.extractall(target_dir)
    # Kept as an equality test on purpose: only True (or a value equal to it)
    # triggers deletion, exactly as before.
    if rm_tar == True:
        os.remove(filepath)
|
||||||
|
|
||||||
|
|
||||||
|
class XmapEndSignal():
    """Empty marker class with no attributes or behavior of its own.

    NOTE(review): presumably used as an end-of-data sentinel by an xmap-style
    reader defined elsewhere -- confirm against its users.
    """
|
Loading…
Reference in new issue