fix musan and rir dataset

pull/538/head
Hui Zhang 5 years ago
parent 45f73c507c
commit 1ae41eac90

@ -0,0 +1,5 @@
manifest.music
manifest.noise
manifest.speech
musan/
musan.tar.gz

@ -33,7 +33,7 @@ DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
URL_ROOT = 'https://www.openslr.org/resources/17' URL_ROOT = 'https://www.openslr.org/resources/17'
DATA_URL = URL_ROOT + '/musan.tar.gz' DATA_URL = URL_ROOT + '/musan.tar.gz'
MD5_DATA = '' MD5_DATA = '0c472d4fc0c5141eca47ad1ffeb2a7df'
parser = argparse.ArgumentParser(description=__doc__) parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument( parser.add_argument(
@ -52,37 +52,24 @@ args = parser.parse_args()
def create_manifest(data_dir, manifest_path_prefix): def create_manifest(data_dir, manifest_path_prefix):
print("Creating manifest %s ..." % manifest_path_prefix) print("Creating manifest %s ..." % manifest_path_prefix)
json_lines = [] json_lines = []
transcript_path = os.path.join(data_dir, 'transcript', data_types = ['music', 'noise', 'speech']
'aishell_transcript_v0.8.txt')
transcript_dict = {}
for line in codecs.open(transcript_path, 'r', 'utf-8'):
line = line.strip()
if line == '': continue
audio_id, text = line.split(' ', 1)
# remove withespace
text = ''.join(text.split())
transcript_dict[audio_id] = text
data_types = ['train', 'dev', 'test']
for type in data_types: for type in data_types:
del json_lines[:] del json_lines[:]
audio_dir = os.path.join(data_dir, 'wav', type) audio_dir = os.path.join(data_dir, type)
for subfolder, _, filelist in sorted(os.walk(audio_dir)): for subfolder, _, filelist in sorted(os.walk(audio_dir)):
print('x, ', subfolder)
for fname in filelist: for fname in filelist:
audio_path = os.path.join(subfolder, fname) audio_path = os.path.join(subfolder, fname)
audio_id = fname[:-4] if not audio_path.endswith('.wav'):
# if no transcription for audio then skipped
if audio_id not in transcript_dict:
continue continue
audio_data, samplerate = soundfile.read(audio_path) audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate) duration = float(len(audio_data) / samplerate)
text = transcript_dict[audio_id]
json_lines.append( json_lines.append(
json.dumps( json.dumps(
{ {
'audio_filepath': audio_path, 'audio_filepath': audio_path,
'duration': duration, 'duration': duration,
'text': text 'type': type,
}, },
ensure_ascii=False)) ensure_ascii=False))
manifest_path = manifest_path_prefix + '.' + type manifest_path = manifest_path_prefix + '.' + type
@ -93,15 +80,10 @@ def create_manifest(data_dir, manifest_path_prefix):
def prepare_dataset(url, md5sum, target_dir, manifest_path): def prepare_dataset(url, md5sum, target_dir, manifest_path):
"""Download, unpack and create manifest file.""" """Download, unpack and create manifest file."""
data_dir = os.path.join(target_dir, 'data_aishell') data_dir = os.path.join(target_dir, 'musan')
if not os.path.exists(data_dir): if not os.path.exists(data_dir):
filepath = download(url, md5sum, target_dir) filepath = download(url, md5sum, target_dir)
unpack(filepath, target_dir) unpack(filepath, target_dir)
# unpack all audio tar files
audio_dir = os.path.join(data_dir, 'wav')
for subfolder, _, filelist in sorted(os.walk(audio_dir)):
for ftar in filelist:
unpack(os.path.join(subfolder, ftar), subfolder, True)
else: else:
print("Skip downloading and unpacking. Data already exists in %s." % print("Skip downloading and unpacking. Data already exists in %s." %
target_dir) target_dir)

@ -0,0 +1,5 @@
RIRS_NOISES/
manifest.pointsource_noises
manifest.real_rirs_isotropic_noises
manifest.simulated_rirs
rirs_noises.zip

@ -27,13 +27,13 @@ import codecs
import soundfile import soundfile
import json import json
import argparse import argparse
from data_utils.utility import download, unpack from utils.utility import download, unpack, unzip
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
URL_ROOT = 'http://www.openslr.org/resources/28' URL_ROOT = 'http://www.openslr.org/resources/28'
DATA_URL = URL_ROOT + '/rirs_noises.zip' DATA_URL = URL_ROOT + '/rirs_noises.zip'
MD5_DATA = '' MD5_DATA = 'e6f48e257286e05de56413b4779d8ffb'
parser = argparse.ArgumentParser(description=__doc__) parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument( parser.add_argument(
@ -52,37 +52,25 @@ args = parser.parse_args()
def create_manifest(data_dir, manifest_path_prefix): def create_manifest(data_dir, manifest_path_prefix):
print("Creating manifest %s ..." % manifest_path_prefix) print("Creating manifest %s ..." % manifest_path_prefix)
json_lines = [] json_lines = []
transcript_path = os.path.join(data_dir, 'transcript', data_types = [
'aishell_transcript_v0.8.txt') 'pointsource_noises', 'real_rirs_isotropic_noises', 'simulated_rirs'
transcript_dict = {} ]
for line in codecs.open(transcript_path, 'r', 'utf-8'):
line = line.strip()
if line == '': continue
audio_id, text = line.split(' ', 1)
# remove withespace
text = ''.join(text.split())
transcript_dict[audio_id] = text
data_types = ['train', 'dev', 'test']
for type in data_types: for type in data_types:
del json_lines[:] del json_lines[:]
audio_dir = os.path.join(data_dir, 'wav', type) audio_dir = os.path.join(data_dir, type)
for subfolder, _, filelist in sorted(os.walk(audio_dir)): for subfolder, _, filelist in sorted(os.walk(audio_dir)):
for fname in filelist: for fname in filelist:
audio_path = os.path.join(subfolder, fname) audio_path = os.path.join(subfolder, fname)
audio_id = fname[:-4] if not audio_path.endswith('.wav'):
# if no transcription for audio then skipped
if audio_id not in transcript_dict:
continue continue
audio_data, samplerate = soundfile.read(audio_path) audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate) duration = float(len(audio_data) / samplerate)
text = transcript_dict[audio_id]
json_lines.append( json_lines.append(
json.dumps( json.dumps(
{ {
'audio_filepath': audio_path, 'audio_filepath': audio_path,
'duration': duration, 'duration': duration,
'text': text 'type': type,
}, },
ensure_ascii=False)) ensure_ascii=False))
manifest_path = manifest_path_prefix + '.' + type manifest_path = manifest_path_prefix + '.' + type
@ -92,16 +80,11 @@ def create_manifest(data_dir, manifest_path_prefix):
def prepare_dataset(url, md5sum, target_dir, manifest_path): def prepare_dataset(url, md5sum, target_dir, manifest_path):
"""Download, unpack and create manifest file.""" """Download, unzip and create manifest file."""
data_dir = os.path.join(target_dir, 'data_aishell') data_dir = os.path.join(target_dir, 'RIRS_NOISES')
if not os.path.exists(data_dir): if not os.path.exists(data_dir):
filepath = download(url, md5sum, target_dir) filepath = download(url, md5sum, target_dir)
unpack(filepath, target_dir) unzip(filepath, target_dir)
# unpack all audio tar files
audio_dir = os.path.join(data_dir, 'wav')
for subfolder, _, filelist in sorted(os.walk(audio_dir)):
for ftar in filelist:
unpack(os.path.join(subfolder, ftar), subfolder, True)
else: else:
print("Skip downloading and unpacking. Data already exists in %s." % print("Skip downloading and unpacking. Data already exists in %s." %
target_dir) target_dir)

@ -14,6 +14,7 @@
import os import os
import tarfile import tarfile
import zipfile
from paddle.dataset.common import md5file from paddle.dataset.common import md5file
@ -59,3 +60,13 @@ def unpack(filepath, target_dir, rm_tar=False):
tar.close() tar.close()
if rm_tar == True: if rm_tar == True:
os.remove(filepath) os.remove(filepath)
def unzip(filepath, target_dir, rm_tar=False):
    """Extract a zip archive into `target_dir`.

    Args:
        filepath: Path to the .zip archive.
        target_dir: Directory to extract into.
        rm_tar: If True, delete the archive file after extraction.
            (Parameter name kept for call-compatibility with `unpack`,
            which handles tar files.)
    """
    print("Unpacking %s ..." % filepath)
    # Context manager guarantees the archive handle is closed even if
    # extraction raises; the original left the handle open on error.
    with zipfile.ZipFile(filepath, 'r') as zip_file:
        zip_file.extractall(target_dir)
    if rm_tar:
        os.remove(filepath)

Loading…
Cancel
Save