Extract common utility functions.

pull/2/head
yangyaming 7 years ago
parent 69e0d86ddb
commit d1420d121e

@ -12,12 +12,12 @@ from __future__ import print_function
import distutils.util
import os
import sys
import tarfile
import argparse
import soundfile
import json
import codecs
from paddle.v2.dataset.common import md5file
from data_utils.utility import download, unpack
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
@ -59,33 +59,6 @@ parser.add_argument(
args = parser.parse_args()
def download(url, md5sum, target_dir):
"""
Download file from url to target_dir, and check md5sum.
"""
if not os.path.exists(target_dir): os.makedirs(target_dir)
filepath = os.path.join(target_dir, url.split("/")[-1])
if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
print("Downloading %s ..." % url)
os.system("wget -c " + url + " -P " + target_dir)
print("\nMD5 Chesksum %s ..." % filepath)
if not md5file(filepath) == md5sum:
raise RuntimeError("MD5 checksum failed.")
else:
print("File exists, skip downloading. (%s)" % filepath)
return filepath
def unpack(filepath, target_dir):
"""
Unpack the file to the target_dir.
"""
print("Unpacking %s ..." % filepath)
tar = tarfile.open(filepath)
tar.extractall(target_dir)
tar.close()
def create_manifest(data_dir, manifest_path):
"""
Create a manifest json file summarizing the data set, with each line

@ -5,6 +5,8 @@ from __future__ import print_function
import json
import codecs
import os
import tarfile
def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
@ -33,3 +35,28 @@ def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
json_data["duration"] >= min_duration):
manifest.append(json_data)
return manifest
def download(url, md5sum, target_dir):
"""Download file from url to target_dir, and check md5sum."""
if not os.path.exists(target_dir): os.makedirs(target_dir)
filepath = os.path.join(target_dir, url.split("/")[-1])
if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
print("Downloading %s ..." % url)
os.system("wget -c " + url + " -P " + target_dir)
print("\nMD5 Chesksum %s ..." % filepath)
if not md5file(filepath) == md5sum:
raise RuntimeError("MD5 checksum failed.")
else:
print("File exists, skip downloading. (%s)" % filepath)
return filepath
def unpack(filepath, target_dir, rm_tar=False):
"""Unpack the file to the target_dir."""
print("Unpacking %s ..." % filepath)
tar = tarfile.open(filepath)
tar.extractall(target_dir)
tar.close()
if rm_tar == True:
os.remove(filepath)

Loading…
Cancel
Save