fix musan and rir dataset

5 years ago · 1ae41eac90
parent 45f73c507c
commit 1ae41eac90
5 changed files with 39 additions and 53 deletions
--- a/examples/dataset/musan/.gitignore
+++ b/examples/dataset/musan/.gitignore
@@ -0,0 +1,5 @@
+manifest.music
+manifest.noise
+manifest.speech
+musan/
+musan.tar.gz
--- a/examples/dataset/musan/musan.py
+++ b/examples/dataset/musan/musan.py
@@ -33,7 +33,7 @@ DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

 URL_ROOT = 'https://www.openslr.org/resources/17'
 DATA_URL = URL_ROOT + '/musan.tar.gz'
-MD5_DATA = ''
+MD5_DATA = '0c472d4fc0c5141eca47ad1ffeb2a7df'

 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
@@ -52,37 +52,24 @@ args = parser.parse_args()
 def create_manifest(data_dir, manifest_path_prefix):
    print("Creating manifest %s ..." % manifest_path_prefix)
    json_lines = []
-    transcript_path = os.path.join(data_dir, 'transcript',
-                                   'aishell_transcript_v0.8.txt')
-    transcript_dict = {}
-    for line in codecs.open(transcript_path, 'r', 'utf-8'):
-        line = line.strip()
-        if line == '': continue
-        audio_id, text = line.split(' ', 1)
-        # remove withespace
-        text = ''.join(text.split())
-        transcript_dict[audio_id] = text
-
-    data_types = ['train', 'dev', 'test']
+    data_types = ['music', 'noise', 'speech']
    for type in data_types:
        del json_lines[:]
-        audio_dir = os.path.join(data_dir, 'wav', type)
+        audio_dir = os.path.join(data_dir, type)
        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
+            print('x, ', subfolder)
            for fname in filelist:
                audio_path = os.path.join(subfolder, fname)
-                audio_id = fname[:-4]
-                # if no transcription for audio then skipped
-                if audio_id not in transcript_dict:
+                if not audio_path.endswith('.wav'):
                    continue
                audio_data, samplerate = soundfile.read(audio_path)
                duration = float(len(audio_data) / samplerate)
-                text = transcript_dict[audio_id]
                json_lines.append(
                    json.dumps(
                        {
                            'audio_filepath': audio_path,
                            'duration': duration,
-                            'text': text
+                            'type': type,
                        },
                        ensure_ascii=False))
        manifest_path = manifest_path_prefix + '.' + type
@@ -93,15 +80,10 @@ def create_manifest(data_dir, manifest_path_prefix):

 def prepare_dataset(url, md5sum, target_dir, manifest_path):
    """Download, unpack and create manifest file."""
-    data_dir = os.path.join(target_dir, 'data_aishell')
+    data_dir = os.path.join(target_dir, 'musan')
    if not os.path.exists(data_dir):
        filepath = download(url, md5sum, target_dir)
        unpack(filepath, target_dir)
-        # unpack all audio tar files
-        audio_dir = os.path.join(data_dir, 'wav')
-        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
-            for ftar in filelist:
-                unpack(os.path.join(subfolder, ftar), subfolder, True)
    else:
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)
--- a/examples/dataset/rir_noise/.gitignore
+++ b/examples/dataset/rir_noise/.gitignore
@@ -0,0 +1,5 @@
+RIRS_NOISES/
+manifest.pointsource_noises
+manifest.real_rirs_isotropic_noises
+manifest.simulated_rirs
+rirs_noises.zip
--- a/examples/dataset/rir_noise/rir_noise.py
+++ b/examples/dataset/rir_noise/rir_noise.py
@@ -27,13 +27,13 @@ import codecs
 import soundfile
 import json
 import argparse
-from data_utils.utility import download, unpack
+from utils.utility import download, unpack, unzip

 DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

 URL_ROOT = 'http://www.openslr.org/resources/28'
 DATA_URL = URL_ROOT + '/rirs_noises.zip'
-MD5_DATA = ''
+MD5_DATA = 'e6f48e257286e05de56413b4779d8ffb'

 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
@@ -52,37 +52,25 @@ args = parser.parse_args()
 def create_manifest(data_dir, manifest_path_prefix):
    print("Creating manifest %s ..." % manifest_path_prefix)
    json_lines = []
-    transcript_path = os.path.join(data_dir, 'transcript',
-                                   'aishell_transcript_v0.8.txt')
-    transcript_dict = {}
-    for line in codecs.open(transcript_path, 'r', 'utf-8'):
-        line = line.strip()
-        if line == '': continue
-        audio_id, text = line.split(' ', 1)
-        # remove withespace
-        text = ''.join(text.split())
-        transcript_dict[audio_id] = text
-
-    data_types = ['train', 'dev', 'test']
+    data_types = [
+        'pointsource_noises', 'real_rirs_isotropic_noises', 'simulated_rirs'
+    ]
    for type in data_types:
        del json_lines[:]
-        audio_dir = os.path.join(data_dir, 'wav', type)
+        audio_dir = os.path.join(data_dir, type)
        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
            for fname in filelist:
                audio_path = os.path.join(subfolder, fname)
-                audio_id = fname[:-4]
-                # if no transcription for audio then skipped
-                if audio_id not in transcript_dict:
+                if not audio_path.endswith('.wav'):
                    continue
                audio_data, samplerate = soundfile.read(audio_path)
                duration = float(len(audio_data) / samplerate)
-                text = transcript_dict[audio_id]
                json_lines.append(
                    json.dumps(
                        {
                            'audio_filepath': audio_path,
                            'duration': duration,
-                            'text': text
+                            'type': type,
                        },
                        ensure_ascii=False))
        manifest_path = manifest_path_prefix + '.' + type
@@ -92,16 +80,11 @@ def create_manifest(data_dir, manifest_path_prefix):


 def prepare_dataset(url, md5sum, target_dir, manifest_path):
-    """Download, unpack and create manifest file."""
-    data_dir = os.path.join(target_dir, 'data_aishell')
+    """Download, unzip and create manifest file."""
+    data_dir = os.path.join(target_dir, 'RIRS_NOISES')
    if not os.path.exists(data_dir):
        filepath = download(url, md5sum, target_dir)
-        unpack(filepath, target_dir)
-        # unpack all audio tar files
-        audio_dir = os.path.join(data_dir, 'wav')
-        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
-            for ftar in filelist:
-                unpack(os.path.join(subfolder, ftar), subfolder, True)
+        unzip(filepath, target_dir)
    else:
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)
--- a/utils/utility.py
+++ b/utils/utility.py
@@ -14,6 +14,7 @@

 import os
 import tarfile
+import zipfile
 from paddle.dataset.common import md5file


@@ -59,3 +60,13 @@ def unpack(filepath, target_dir, rm_tar=False):
    tar.close()
    if rm_tar == True:
        os.remove(filepath)
+
+
+def unzip(filepath, target_dir, rm_tar=False):
+    """Extract zip `filepath` into `target_dir`; delete it if `rm_tar`."""
+    print("Unpacking %s ..." % filepath)
+    # `with` closes the archive even when extraction raises.
+    with zipfile.ZipFile(filepath, 'r') as archive:
+        archive.extractall(target_dir)
+    if rm_tar:
+        os.remove(filepath)