diff --git a/data/librispeech/data_utils/data.pyc b/data/librispeech/data_utils/data.pyc
new file mode 100644
index 000000000..7daa7a484
Binary files /dev/null and b/data/librispeech/data_utils/data.pyc differ
diff --git a/data/librispeech/data_utils/featurizer/text_featurizer.py b/data/librispeech/data_utils/featurizer/text_featurizer.py
new file mode 100644
index 000000000..89202163c
--- /dev/null
+++ b/data/librispeech/data_utils/featurizer/text_featurizer.py
@@ -0,0 +1,68 @@
+"""Contains the text featurizer class."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import codecs
+
+
+class TextFeaturizer(object):
+    """Text featurizer, for processing or extracting features from text.
+
+    Currently, it only supports char-level tokenizing and conversion into
+    a list of token indices. Note that the token indexing order follows the
+    given vocabulary file.
+
+    :param vocab_filepath: Filepath to load vocabulary for token indices
+                           conversion.
+    :type vocab_filepath: basestring
+    """
+
+    def __init__(self, vocab_filepath):
+        self._vocab_dict, self._vocab_list = self._load_vocabulary_from_file(
+            vocab_filepath)
+
+    def featurize(self, text):
+        """Convert a text string to a list of char-level token indices. Note
+        that the token indexing order follows the given vocabulary file.
+
+        :param text: Text to process.
+        :type text: basestring
+        :return: List of char-level token indices.
+        :rtype: list
+        """
+        tokens = self._char_tokenize(text)
+        return [self._vocab_dict[token] for token in tokens]
+
+    @property
+    def vocab_size(self):
+        """Return the vocabulary size.
+
+        :return: Vocabulary size.
+        :rtype: int
+        """
+        return len(self._vocab_list)
+
+    @property
+    def vocab_list(self):
+        """Return the vocabulary as a list.
+
+        :return: Vocabulary as a list.
+        :rtype: list
+        """
+        return self._vocab_list
+
+    def _char_tokenize(self, text):
+        """Character tokenizer."""
+        return list(text.strip())
+
+    def _load_vocabulary_from_file(self, vocab_filepath):
+        """Load vocabulary from file."""
+        vocab_lines = []
+        with codecs.open(vocab_filepath, 'r', 'utf-8') as vocab_file:
+            vocab_lines.extend(vocab_file.readlines())
+        vocab_list = [line[:-1] for line in vocab_lines]
+        vocab_dict = dict(
+            [(token, idx) for (idx, token) in enumerate(vocab_list)])
+        return vocab_dict, vocab_list
diff --git a/data/librispeech/data_utils/normalizer.pyc b/data/librispeech/data_utils/normalizer.pyc
new file mode 100644
index 000000000..5364f86a4
Binary files /dev/null and b/data/librispeech/data_utils/normalizer.pyc differ
diff --git a/data/librispeech/data_utils/utility.py b/data/librispeech/data_utils/utility.py
new file mode 100644
index 000000000..7143f7ded
--- /dev/null
+++ b/data/librispeech/data_utils/utility.py
@@ -0,0 +1,90 @@
+"""Contains data helper functions."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+import codecs
+import os
+import tarfile
+import time
+from Queue import Queue
+from threading import Thread
+from multiprocessing import Process, Manager, Value
+from paddle.dataset.common import md5file
+
+
+def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
+    """Load and parse manifest file.
+
+    Instances with durations outside [min_duration, max_duration] will be
+    filtered out.
+
+    :param manifest_path: Manifest file to load and parse.
+    :type manifest_path: basestring
+    :param max_duration: Maximal duration in seconds for instance filter.
+    :type max_duration: float
+    :param min_duration: Minimal duration in seconds for instance filter.
+    :type min_duration: float
+    :return: Manifest parsing results. List of dict.
+    :rtype: list
+    :raises IOError: If failed to parse the manifest.
+    """
+    manifest = []
+    for json_line in codecs.open(manifest_path, 'r', 'utf-8'):
+        try:
+            json_data = json.loads(json_line)
+        except Exception as e:
+            raise IOError("Error reading manifest: %s" % str(e))
+        if (json_data["duration"] <= max_duration and
+                json_data["duration"] >= min_duration):
+            manifest.append(json_data)
+    return manifest
+
+
+def getfile_insensitive(path):
+    """Return the actual file path given a case-insensitive filename."""
+    directory, filename = os.path.split(path)
+    directory, filename = (directory or '.'), filename.lower()
+    for f in os.listdir(directory):
+        newpath = os.path.join(directory, f)
+        if os.path.isfile(newpath) and f.lower() == filename:
+            return newpath
+
+
+def download_multi(url, target_dir, extra_args):
+    """Download multiple files from url to target_dir."""
+    if not os.path.exists(target_dir): os.makedirs(target_dir)
+    print("Downloading %s ..." % url)
+    ret_code = os.system("wget -c " + url + ' ' + extra_args + " -P " +
+                         target_dir)
+    return ret_code
+
+
+def download(url, md5sum, target_dir):
+    """Download file from url to target_dir, and check md5sum."""
+    if not os.path.exists(target_dir): os.makedirs(target_dir)
+    filepath = os.path.join(target_dir, url.split("/")[-1])
+    if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
+        print("Downloading %s ..." % url)
+        os.system("wget -c " + url + " -P " + target_dir)
+        print("\nMD5 Checksum %s ..." % filepath)
+        if not md5file(filepath) == md5sum:
+            raise RuntimeError("MD5 checksum failed.")
+    else:
+        print("File exists, skip downloading. (%s)" % filepath)
+    return filepath
+
+
+def unpack(filepath, target_dir, rm_tar=False):
+    """Unpack the file to the target_dir."""
+    print("Unpacking %s ..." % filepath)
+    tar = tarfile.open(filepath)
+    tar.extractall(target_dir)
+    tar.close()
+    if rm_tar:
+        os.remove(filepath)
+
+
+class XmapEndSignal(object):
+    pass
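
For reviewers, a minimal usage sketch of how the two new modules fit together. The file paths, the `data_utils` import root, and the `"text"` manifest field are illustrative assumptions, not guaranteed by this diff; only `read_manifest`, its `"duration"` filter, and `TextFeaturizer.featurize` are taken from the code above.

```python
# Hypothetical usage sketch (Python 2, matching the modules in this diff).
# 'vocab.txt' (one token per line) and 'manifest.train' (JSON lines with at
# least a "duration" field; "text" is assumed here) are placeholder paths.
from data_utils.featurizer.text_featurizer import TextFeaturizer
from data_utils.utility import read_manifest

featurizer = TextFeaturizer(vocab_filepath='vocab.txt')

# Keep only utterances between 1.0 and 20.0 seconds long.
manifest = read_manifest('manifest.train', max_duration=20.0, min_duration=1.0)

for entry in manifest[:3]:
    # read_manifest returns a list of dicts, one per JSON line; "text" is
    # assumed to hold the transcript string for char-level featurization.
    token_ids = featurizer.featurize(entry["text"])
    print(entry["duration"], token_ids[:10])
```

Note that `featurize` raises `KeyError` on any character missing from the vocabulary file, so the vocabulary must cover every character that can appear in manifest transcripts.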