[s2t] mv dataset into paddlespeech.dataset (#3183)

* mv dataset into paddlespeech.dataset * add aidatatang * fix import
3 years ago · 35d874c532
parent 3ad55a31e7
commit 35d874c532
27 changed files with 619 additions and 387 deletions
--- a/dataset/aidatatang_200zh/aidatatang_200zh.py
+++ b/dataset/aidatatang_200zh/aidatatang_200zh.py
@ -18,139 +18,7 @@ Manifest file is a json-format file with each line containing the
 meta data (i.e. audio filepath, transcript and audio duration)
 of each audio file in the data set.
 """
-import argparse
-import codecs
-import json
-import os
-from pathlib import Path
-
-import soundfile
-
-from utils.utility import download
-from utils.utility import unpack
-
-DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
-
-URL_ROOT = 'http://www.openslr.org/resources/62'
-# URL_ROOT = 'https://openslr.magicdatatech.com/resources/62'
-DATA_URL = URL_ROOT + '/aidatatang_200zh.tgz'
-MD5_DATA = '6e0f4f39cd5f667a7ee53c397c8d0949'
-
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
-    "--target_dir",
-    default=DATA_HOME + "/aidatatang_200zh",
-    type=str,
-    help="Directory to save the dataset. (default: %(default)s)")
-parser.add_argument(
-    "--manifest_prefix",
-    default="manifest",
-    type=str,
-    help="Filepath prefix for output manifests. (default: %(default)s)")
-args = parser.parse_args()
-
-
-def create_manifest(data_dir, manifest_path_prefix):
-    print("Creating manifest %s ..." % manifest_path_prefix)
-    json_lines = []
-    transcript_path = os.path.join(data_dir, 'transcript',
-                                   'aidatatang_200_zh_transcript.txt')
-    transcript_dict = {}
-    for line in codecs.open(transcript_path, 'r', 'utf-8'):
-        line = line.strip()
-        if line == '':
-            continue
-        audio_id, text = line.split(' ', 1)
-        # remove withespace, charactor text
-        text = ''.join(text.split())
-        transcript_dict[audio_id] = text
-
-    data_types = ['train', 'dev', 'test']
-    for dtype in data_types:
-        del json_lines[:]
-        total_sec = 0.0
-        total_text = 0.0
-        total_num = 0
-
-        audio_dir = os.path.join(data_dir, 'corpus/', dtype)
-        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
-            for fname in filelist:
-                if not fname.endswith('.wav'):
-                    continue
-
-                audio_path = os.path.abspath(os.path.join(subfolder, fname))
-                audio_id = os.path.basename(fname)[:-4]
-                utt2spk = Path(audio_path).parent.name
-
-                audio_data, samplerate = soundfile.read(audio_path)
-                duration = float(len(audio_data) / samplerate)
-                text = transcript_dict[audio_id]
-                json_lines.append(
-                    json.dumps(
-                        {
-                            'utt': audio_id,
-                            'utt2spk': str(utt2spk),
-                            'feat': audio_path,
-                            'feat_shape': (duration, ),  # second
-                            'text': text,
-                        },
-                        ensure_ascii=False))
-
-                total_sec += duration
-                total_text += len(text)
-                total_num += 1
-
-        manifest_path = manifest_path_prefix + '.' + dtype
-        with codecs.open(manifest_path, 'w', 'utf-8') as fout:
-            for line in json_lines:
-                fout.write(line + '\n')
-
-        manifest_dir = os.path.dirname(manifest_path_prefix)
-        meta_path = os.path.join(manifest_dir, dtype) + '.meta'
-        with open(meta_path, 'w') as f:
-            print(f"{dtype}:", file=f)
-            print(f"{total_num} utts", file=f)
-            print(f"{total_sec / (60*60)} h", file=f)
-            print(f"{total_text} text", file=f)
-            print(f"{total_text / total_sec} text/sec", file=f)
-            print(f"{total_sec / total_num} sec/utt", file=f)
-
-
-def prepare_dataset(url, md5sum, target_dir, manifest_path, subset):
-    """Download, unpack and create manifest file."""
-    data_dir = os.path.join(target_dir, subset)
-    if not os.path.exists(data_dir):
-        filepath = download(url, md5sum, target_dir)
-        unpack(filepath, target_dir)
-        # unpack all audio tar files
-        audio_dir = os.path.join(data_dir, 'corpus')
-        for subfolder, dirlist, filelist in sorted(os.walk(audio_dir)):
-            for sub in dirlist:
-                print(f"unpack dir {sub}...")
-                for folder, _, filelist in sorted(
-                        os.walk(os.path.join(subfolder, sub))):
-                    for ftar in filelist:
-                        unpack(os.path.join(folder, ftar), folder, True)
-    else:
-        print("Skip downloading and unpacking. Data already exists in %s." %
-              target_dir)
-
-    create_manifest(data_dir, manifest_path)
-
-
-def main():
-    if args.target_dir.startswith('~'):
-        args.target_dir = os.path.expanduser(args.target_dir)
-
-    prepare_dataset(
-        url=DATA_URL,
-        md5sum=MD5_DATA,
-        target_dir=args.target_dir,
-        manifest_path=args.manifest_prefix,
-        subset='aidatatang_200zh')
-
-    print("Data download and manifest prepare done!")
-
+from paddlespeech.dataset.aidatatang_200zh import aidatatang_200zh_main

 if __name__ == '__main__':
-    main()
+    aidatatang_200zh_main()
--- a/dataset/aishell/README.md
+++ b/dataset/aishell/README.md
@ -1,3 +0,0 @@
-# [Aishell1](http://openslr.elda.org/33/)
-
-This Open Source Mandarin Speech Corpus, AISHELL-ASR0009-OS1, is 178 hours long. It is a part of AISHELL-ASR0009, of which utterance contains 11 domains, including smart home, autonomous driving, and industrial production. The whole recording was put in quiet indoor environment, using 3 different devices at the same time: high fidelity microphone (44.1kHz, 16-bit,); Android-system mobile phone (16kHz, 16-bit), iOS-system mobile phone (16kHz, 16-bit). Audios in high fidelity were re-sampled to 16kHz to build AISHELL- ASR0009-OS1. 400 speakers from different accent areas in China were invited to participate in the recording. The manual transcription accuracy rate is above 95%, through professional speech annotation and strict quality inspection. The corpus is divided into training, development and testing sets. ( This database is free for academic research, not in the commerce, if without permission. )
--- a/dataset/aishell/aishell.py
+++ b/dataset/aishell/aishell.py
@ -18,143 +18,7 @@ Manifest file is a json-format file with each line containing the
 meta data (i.e. audio filepath, transcript and audio duration)
 of each audio file in the data set.
 """
-import argparse
-import codecs
-import json
-import os
-from pathlib import Path
-
-import soundfile
-
-from utils.utility import download
-from utils.utility import unpack
-
-DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
-
-URL_ROOT = 'http://openslr.elda.org/resources/33'
-# URL_ROOT = 'https://openslr.magicdatatech.com/resources/33'
-DATA_URL = URL_ROOT + '/data_aishell.tgz'
-MD5_DATA = '2f494334227864a8a8fec932999db9d8'
-RESOURCE_URL = URL_ROOT + '/resource_aishell.tgz'
-MD5_RESOURCE = '957d480a0fcac85fc18e550756f624e5'
-
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
-    "--target_dir",
-    default=DATA_HOME + "/Aishell",
-    type=str,
-    help="Directory to save the dataset. (default: %(default)s)")
-parser.add_argument(
-    "--manifest_prefix",
-    default="manifest",
-    type=str,
-    help="Filepath prefix for output manifests. (default: %(default)s)")
-args = parser.parse_args()
-
-
-def create_manifest(data_dir, manifest_path_prefix):
-    print("Creating manifest %s ..." % manifest_path_prefix)
-    json_lines = []
-    transcript_path = os.path.join(data_dir, 'transcript',
-                                   'aishell_transcript_v0.8.txt')
-    transcript_dict = {}
-    for line in codecs.open(transcript_path, 'r', 'utf-8'):
-        line = line.strip()
-        if line == '':
-            continue
-        audio_id, text = line.split(' ', 1)
-        # remove withespace, charactor text
-        text = ''.join(text.split())
-        transcript_dict[audio_id] = text
-
-    data_types = ['train', 'dev', 'test']
-    for dtype in data_types:
-        del json_lines[:]
-        total_sec = 0.0
-        total_text = 0.0
-        total_num = 0
-
-        audio_dir = os.path.join(data_dir, 'wav', dtype)
-        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
-            for fname in filelist:
-                audio_path = os.path.abspath(os.path.join(subfolder, fname))
-                audio_id = os.path.basename(fname)[:-4]
-                # if no transcription for audio then skipped
-                if audio_id not in transcript_dict:
-                    continue
-
-                utt2spk = Path(audio_path).parent.name
-                audio_data, samplerate = soundfile.read(audio_path)
-                duration = float(len(audio_data) / samplerate)
-                text = transcript_dict[audio_id]
-                json_lines.append(
-                    json.dumps(
-                        {
-                            'utt': audio_id,
-                            'utt2spk': str(utt2spk),
-                            'feat': audio_path,
-                            'feat_shape': (duration, ),  # second
-                            'text': text
-                        },
-                        ensure_ascii=False))
-
-                total_sec += duration
-                total_text += len(text)
-                total_num += 1
-
-        manifest_path = manifest_path_prefix + '.' + dtype
-        with codecs.open(manifest_path, 'w', 'utf-8') as fout:
-            for line in json_lines:
-                fout.write(line + '\n')
-
-        manifest_dir = os.path.dirname(manifest_path_prefix)
-        meta_path = os.path.join(manifest_dir, dtype) + '.meta'
-        with open(meta_path, 'w') as f:
-            print(f"{dtype}:", file=f)
-            print(f"{total_num} utts", file=f)
-            print(f"{total_sec / (60*60)} h", file=f)
-            print(f"{total_text} text", file=f)
-            print(f"{total_text / total_sec} text/sec", file=f)
-            print(f"{total_sec / total_num} sec/utt", file=f)
-
-
-def prepare_dataset(url, md5sum, target_dir, manifest_path=None):
-    """Download, unpack and create manifest file."""
-    data_dir = os.path.join(target_dir, 'data_aishell')
-    if not os.path.exists(data_dir):
-        filepath = download(url, md5sum, target_dir)
-        unpack(filepath, target_dir)
-        # unpack all audio tar files
-        audio_dir = os.path.join(data_dir, 'wav')
-        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
-            for ftar in filelist:
-                unpack(os.path.join(subfolder, ftar), subfolder, True)
-    else:
-        print("Skip downloading and unpacking. Data already exists in %s." %
-              target_dir)
-
-    if manifest_path:
-        create_manifest(data_dir, manifest_path)
-
-
-def main():
-    if args.target_dir.startswith('~'):
-        args.target_dir = os.path.expanduser(args.target_dir)
-
-    prepare_dataset(
-        url=DATA_URL,
-        md5sum=MD5_DATA,
-        target_dir=args.target_dir,
-        manifest_path=args.manifest_prefix)
-
-    prepare_dataset(
-        url=RESOURCE_URL,
-        md5sum=MD5_RESOURCE,
-        target_dir=args.target_dir,
-        manifest_path=None)
-
-    print("Data download and manifest prepare done!")
-
+from paddlespeech.dataset.aishell import aishell_main

 if __name__ == '__main__':
-    main()
+    aishell_main()
--- a/dataset/librispeech/librispeech.py
+++ b/dataset/librispeech/librispeech.py
@ -28,8 +28,8 @@ from multiprocessing.pool import Pool
 import distutils.util
 import soundfile

-from utils.utility import download
-from utils.utility import unpack
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unpack

 URL_ROOT = "http://openslr.elda.org/resources/12"
 #URL_ROOT = "https://openslr.magicdatatech.com/resources/12"
--- a/dataset/mini_librispeech/mini_librispeech.py
+++ b/dataset/mini_librispeech/mini_librispeech.py
@ -27,8 +27,8 @@ from multiprocessing.pool import Pool

 import soundfile

-from utils.utility import download
-from utils.utility import unpack
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unpack

 URL_ROOT = "http://openslr.elda.org/resources/31"
 URL_TRAIN_CLEAN = URL_ROOT + "/train-clean-5.tar.gz"
--- a/dataset/musan/musan.py
+++ b/dataset/musan/musan.py
@ -29,8 +29,8 @@ import os

 import soundfile

-from utils.utility import download
-from utils.utility import unpack
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unpack

 DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

--- a/dataset/rir_noise/rir_noise.py
+++ b/dataset/rir_noise/rir_noise.py
@ -29,8 +29,8 @@ import os

 import soundfile

-from utils.utility import download
-from utils.utility import unzip
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unzip

 DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

--- a/dataset/thchs30/thchs30.py
+++ b/dataset/thchs30/thchs30.py
@ -27,8 +27,8 @@ from pathlib import Path

 import soundfile

-from utils.utility import download
-from utils.utility import unpack
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unpack

 DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

--- a/dataset/timit/timit.py
+++ b/dataset/timit/timit.py
@ -28,7 +28,7 @@ from pathlib import Path

 import soundfile

-from utils.utility import unzip
+from paddlespeech.dataset.download import unzip

 URL_ROOT = ""
 MD5_DATA = "45c68037c7fdfe063a43c851f181fb2d"
--- a/dataset/voxceleb/voxceleb1.py
+++ b/dataset/voxceleb/voxceleb1.py
@ -31,9 +31,9 @@ from pathlib import Path

 import soundfile

-from utils.utility import check_md5sum
-from utils.utility import download
-from utils.utility import unzip
+from paddlespeech.dataset.download import check_md5sum
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unzip

 # all the data will be download in the current data/voxceleb directory default
 DATA_HOME = os.path.expanduser('.')
--- a/dataset/voxceleb/voxceleb2.py
+++ b/dataset/voxceleb/voxceleb2.py
@ -27,9 +27,9 @@ from pathlib import Path

 import soundfile

-from utils.utility import check_md5sum
-from utils.utility import download
-from utils.utility import unzip
+from paddlespeech.dataset.download import check_md5sum
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unzip

 # all the data will be download in the current data/voxceleb directory default
 DATA_HOME = os.path.expanduser('.')
--- a/dataset/voxforge/voxforge.py
+++ b/dataset/voxforge/voxforge.py
@ -28,9 +28,9 @@ import subprocess

 import soundfile

-from utils.utility import download_multi
-from utils.utility import getfile_insensitive
-from utils.utility import unpack
+from paddlespeech.dataset.download import download_multi
+from paddlespeech.dataset.download import getfile_insensitive
+from paddlespeech.dataset.download import unpack

 DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

--- a/demos/audio_searching/src/test_audio_search.py
+++ b/demos/audio_searching/src/test_audio_search.py
@ -14,8 +14,8 @@
 from audio_search import app
 from fastapi.testclient import TestClient

-from utils.utility import download
-from utils.utility import unpack
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unpack

 client = TestClient(app)

--- a/demos/audio_searching/src/test_vpr_search.py
+++ b/demos/audio_searching/src/test_vpr_search.py
@ -14,8 +14,8 @@
 from fastapi.testclient import TestClient
 from vpr_search import app

-from utils.utility import download
-from utils.utility import unpack
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unpack

 client = TestClient(app)

--- a/paddlespeech/init.py
+++ b/paddlespeech/init.py
@ -13,3 +13,7 @@
 # limitations under the License.
 import _locale
 _locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8'])
+
+__version__ = '0.0.0'
+
+__commit__ = '9cf8c1985a98bb380c183116123672976bdfe5c9'
--- a/paddlespeech/dataset/init.py
+++ b/paddlespeech/dataset/init.py
--- a/paddlespeech/dataset/aidatatang_200zh/README.md
+++ b/paddlespeech/dataset/aidatatang_200zh/README.md
--- a/paddlespeech/dataset/aidatatang_200zh/init.py
+++ b/paddlespeech/dataset/aidatatang_200zh/init.py
@ -0,0 +1,14 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .aidatatang_200zh import main as aidatatang_200zh_main
--- a/paddlespeech/dataset/aidatatang_200zh/aidatatang_200zh.py
+++ b/paddlespeech/dataset/aidatatang_200zh/aidatatang_200zh.py
@ -0,0 +1,157 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Prepare aidatatang_200zh mandarin dataset
+
+Download, unpack and create manifest files.
+Manifest file is a json-format file with each line containing the
+meta data (i.e. audio filepath, transcript and audio duration)
+of each audio file in the data set.
+"""
+import argparse
+import codecs
+import json
+import os
+from pathlib import Path
+
+import soundfile
+
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unpack
+
+# Default root directory under which the dataset is saved.
+DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
+
+# Archive location and the md5 checksum handed to download() by main().
+URL_ROOT = 'http://www.openslr.org/resources/62'
+# URL_ROOT = 'https://openslr.magicdatatech.com/resources/62'
+DATA_URL = URL_ROOT + '/aidatatang_200zh.tgz'
+MD5_DATA = '6e0f4f39cd5f667a7ee53c397c8d0949'
+
+# Command-line options read by main().
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    "--target_dir",
+    default=DATA_HOME + "/aidatatang_200zh",
+    type=str,
+    help="Directory to save the dataset. (default: %(default)s)")
+parser.add_argument(
+    "--manifest_prefix",
+    default="manifest",
+    type=str,
+    help="Filepath prefix for output manifests. (default: %(default)s)")
+# NOTE(review): parse_args() runs at import time, and the package __init__
+# imports this module, so any program that imports the package gets its own
+# sys.argv parsed here (argparse exits on options it does not know).
+# Consider moving the parsing into main().
+args = parser.parse_args()
+
+
+def create_manifest(data_dir, manifest_path_prefix):
+    """Create per-split manifest and meta files for aidatatang_200zh.
+
+    For each of the ``train``, ``dev`` and ``test`` splits, every ``.wav``
+    file below ``<data_dir>/corpus/<split>`` becomes one JSON line in
+    ``<manifest_path_prefix>.<split>``; summary statistics are written to
+    ``<dirname(manifest_path_prefix)>/<split>.meta``.
+
+    Args:
+        data_dir: Directory holding ``transcript/`` and ``corpus/``.
+        manifest_path_prefix: Filepath prefix of the output manifests.
+
+    Raises:
+        KeyError: If a ``.wav`` file has no line in the transcript file.
+    """
+    print("Creating manifest %s ..." % manifest_path_prefix)
+    transcript_path = os.path.join(data_dir, 'transcript',
+                                   'aidatatang_200_zh_transcript.txt')
+    transcript_dict = {}
+    # Context manager: the transcript handle was previously never closed.
+    with codecs.open(transcript_path, 'r', 'utf-8') as fin:
+        for line in fin:
+            line = line.strip()
+            if line == '':
+                continue
+            audio_id, text = line.split(' ', 1)
+            # remove whitespace so the text is a plain character sequence
+            text = ''.join(text.split())
+            transcript_dict[audio_id] = text
+
+    data_types = ['train', 'dev', 'test']
+    for dtype in data_types:
+        json_lines = []
+        total_sec = 0.0
+        total_text = 0.0
+        total_num = 0
+
+        audio_dir = os.path.join(data_dir, 'corpus/', dtype)
+        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
+            for fname in filelist:
+                if not fname.endswith('.wav'):
+                    continue
+
+                audio_path = os.path.abspath(os.path.join(subfolder, fname))
+                audio_id = os.path.basename(fname)[:-4]
+                # the speaker id is the name of the folder holding the audio
+                utt2spk = Path(audio_path).parent.name
+
+                audio_data, samplerate = soundfile.read(audio_path)
+                duration = float(len(audio_data) / samplerate)
+                text = transcript_dict[audio_id]
+                json_lines.append(
+                    json.dumps(
+                        {
+                            'utt': audio_id,
+                            'utt2spk': str(utt2spk),
+                            'feat': audio_path,
+                            'feat_shape': (duration, ),  # second
+                            'text': text,
+                        },
+                        ensure_ascii=False))
+
+                total_sec += duration
+                total_text += len(text)
+                total_num += 1
+
+        manifest_path = manifest_path_prefix + '.' + dtype
+        with codecs.open(manifest_path, 'w', 'utf-8') as fout:
+            for line in json_lines:
+                fout.write(line + '\n')
+
+        # An empty or missing split used to raise ZeroDivisionError here;
+        # report zero rates instead so the remaining splits still get written.
+        text_per_sec = total_text / total_sec if total_sec else 0.0
+        sec_per_utt = total_sec / total_num if total_num else 0.0
+
+        manifest_dir = os.path.dirname(manifest_path_prefix)
+        meta_path = os.path.join(manifest_dir, dtype) + '.meta'
+        with open(meta_path, 'w') as f:
+            print(f"{dtype}:", file=f)
+            print(f"{total_num} utts", file=f)
+            print(f"{total_sec / (60*60)} h", file=f)
+            print(f"{total_text} text", file=f)
+            print(f"{text_per_sec} text/sec", file=f)
+            print(f"{sec_per_utt} sec/utt", file=f)
+
+
+def prepare_dataset(url, md5sum, target_dir, manifest_path, subset):
+    """Fetch and extract the corpus when it is absent, then build manifests.
+
+    The archive is downloaded and unpacked only if ``<target_dir>/<subset>``
+    does not exist yet; the manifests are created on every call.
+    """
+    data_dir = os.path.join(target_dir, subset)
+    if os.path.exists(data_dir):
+        print("Skip downloading and unpacking. Data already exists in %s." %
+              target_dir)
+    else:
+        archive_path = download(url, md5sum, target_dir)
+        unpack(archive_path, target_dir)
+        # unpack every file found below each sub-directory of corpus/
+        corpus_dir = os.path.join(data_dir, 'corpus')
+        for parent, child_dirs, _ in sorted(os.walk(corpus_dir)):
+            for child in child_dirs:
+                print(f"unpack dir {child}...")
+                child_root = os.path.join(parent, child)
+                for folder, _, archive_names in sorted(os.walk(child_root)):
+                    for archive_name in archive_names:
+                        unpack(
+                            os.path.join(folder, archive_name), folder, True)
+
+    create_manifest(data_dir, manifest_path)
+
+
+def main():
+    """Command-line entry point: prepare the aidatatang_200zh corpus."""
+    print(f"args: {args}")
+    target_dir = args.target_dir
+    if target_dir.startswith('~'):
+        # keep the expanded path on the namespace as well
+        target_dir = os.path.expanduser(target_dir)
+        args.target_dir = target_dir
+
+    prepare_dataset(
+        DATA_URL,
+        MD5_DATA,
+        target_dir,
+        args.manifest_prefix,
+        'aidatatang_200zh')
+
+    print("Data download and manifest prepare done!")
+
+
+if __name__ == '__main__':
+    main()
--- a/paddlespeech/dataset/aishell/README.md
+++ b/paddlespeech/dataset/aishell/README.md
@ -0,0 +1,58 @@
+# [Aishell1](http://openslr.elda.org/33/)
+
+This Open Source Mandarin Speech Corpus, AISHELL-ASR0009-OS1, is 178 hours long. It is a part of AISHELL-ASR0009, whose utterances cover 11 domains, including smart home, autonomous driving, and industrial production. The whole recording was made in a quiet indoor environment, using 3 different devices at the same time: a high fidelity microphone (44.1kHz, 16-bit); an Android-system mobile phone (16kHz, 16-bit); and an iOS-system mobile phone (16kHz, 16-bit). The high fidelity audio was re-sampled to 16kHz to build AISHELL-ASR0009-OS1. 400 speakers from different accent areas in China were invited to participate in the recording. The manual transcription accuracy rate is above 95%, achieved through professional speech annotation and strict quality inspection. The corpus is divided into training, development and testing sets. (This database is free for academic research; commercial use is not allowed without permission.)
+
+
+## Dataset Architecture
+
+```bash
+data_aishell
+├── transcript      # text 目录
+└── wav             # wav 目录
+    ├── dev         # dev 目录
+    │   ├── S0724   # spk 目录
+    │   ├── S0725
+    │   ├── S0726
+    ├── train
+    │   ├── S0724
+    │   ├── S0725
+    │   ├── S0726
+    ├── test
+    │   ├── S0724
+    │   ├── S0725
+    │   ├── S0726
+ 
+
+data_aishell
+├── transcript
+│   └── aishell_transcript_v0.8.txt   # 文本标注文件
+└── wav
+    ├── dev
+    │   ├── S0724
+    │   │   ├── BAC009S0724W0121.wav  # S0724 的音频
+    │   │   ├── BAC009S0724W0122.wav
+    │   │   ├── BAC009S0724W0123.wav
+    ├── test
+    │   ├── S0724
+    │   │   ├── BAC009S0724W0121.wav
+    │   │   ├── BAC009S0724W0122.wav
+    │   │   ├── BAC009S0724W0123.wav
+    ├── train
+    │   ├── S0724
+    │   │   ├── BAC009S0724W0121.wav
+    │   │   ├── BAC009S0724W0122.wav
+    │   │   ├── BAC009S0724W0123.wav
+    
+标注文件格式： <utt> <tokens>
+> head data_aishell/transcript/aishell_transcript_v0.8.txt 
+BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
+BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
+BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
+BAC009S0002W0125 各地 政府 便 纷纷 跟进
+BAC009S0002W0126 仅 一 个 多 月 的 时间 里
+BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
+BAC009S0002W0128 四十六 个 限 购 城市 当中
+BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
+BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
+BAC009S0002W0131 显示 出 了 极 强 的 威力
+```
--- a/paddlespeech/dataset/aishell/init.py
+++ b/paddlespeech/dataset/aishell/init.py
@ -0,0 +1,18 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .aishell import check_dataset
+from .aishell import create_manifest
+from .aishell import download_dataset
+from .aishell import main as aishell_main
+from .aishell import prepare_dataset
--- a/paddlespeech/dataset/aishell/aishell.py
+++ b/paddlespeech/dataset/aishell/aishell.py
@ -0,0 +1,229 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Prepare Aishell mandarin dataset
+
+Download, unpack and create manifest files.
+Manifest file is a json-format file with each line containing the
+meta data (i.e. audio filepath, transcript and audio duration)
+of each audio file in the data set.
+"""
+import argparse
+import codecs
+import json
+import os
+from pathlib import Path
+
+import soundfile
+
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unpack
+
+# Default root directory under which the dataset is saved.
+DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
+
+# Archive locations and the md5 checksums handed to download() via main().
+URL_ROOT = 'http://openslr.elda.org/resources/33'
+# URL_ROOT = 'https://openslr.magicdatatech.com/resources/33'
+DATA_URL = URL_ROOT + '/data_aishell.tgz'
+MD5_DATA = '2f494334227864a8a8fec932999db9d8'
+RESOURCE_URL = URL_ROOT + '/resource_aishell.tgz'
+MD5_RESOURCE = '957d480a0fcac85fc18e550756f624e5'
+
+# Command-line options read by main().
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    "--target_dir",
+    default=DATA_HOME + "/Aishell",
+    type=str,
+    help="Directory to save the dataset. (default: %(default)s)")
+parser.add_argument(
+    "--manifest_prefix",
+    default="manifest",
+    type=str,
+    help="Filepath prefix for output manifests. (default: %(default)s)")
+# NOTE(review): parse_args() runs at import time, and the package __init__
+# imports this module, so any program that imports the package gets its own
+# sys.argv parsed here (argparse exits on options it does not know).
+# Consider moving the parsing into main().
+args = parser.parse_args()
+
+
+def create_manifest(data_dir, manifest_path_prefix):
+    """Create per-split manifest and meta files for Aishell.
+
+    For each of the ``train``, ``dev`` and ``test`` splits, every audio file
+    below ``<data_dir>/wav/<split>`` that has a transcript becomes one JSON
+    line in ``<manifest_path_prefix>.<split>``; summary statistics are
+    written to ``<dirname(manifest_path_prefix)>/<split>.meta``.
+
+    Args:
+        data_dir: Directory holding ``transcript/`` and ``wav/``.
+        manifest_path_prefix: Filepath prefix of the output manifests.
+
+    Returns:
+        dict mapping each split name to its statistics dict (keys ``dtype``,
+        ``utts``, ``hours``, ``text``, ``text/sec``, ``sec/utt``).
+    """
+    # The manifests are written to the prefix itself, not below data_dir, so
+    # report the location that is actually used.
+    print("Creating manifest %s ..." % os.path.abspath(manifest_path_prefix))
+    transcript_path = os.path.join(data_dir, 'transcript',
+                                   'aishell_transcript_v0.8.txt')
+    transcript_dict = {}
+    # Context manager: the transcript handle was previously never closed.
+    with codecs.open(transcript_path, 'r', 'utf-8') as fin:
+        for line in fin:
+            line = line.strip()
+            if line == '':
+                continue
+            audio_id, text = line.split(' ', 1)
+            # remove whitespace so the text is a plain character sequence
+            text = ''.join(text.split())
+            transcript_dict[audio_id] = text
+
+    data_metas = dict()
+    data_types = ['train', 'dev', 'test']
+    for dtype in data_types:
+        json_lines = []
+        total_sec = 0.0
+        total_text = 0.0
+        total_num = 0
+
+        audio_dir = os.path.join(data_dir, 'wav', dtype)
+        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
+            for fname in filelist:
+                audio_path = os.path.abspath(os.path.join(subfolder, fname))
+                audio_id = os.path.basename(fname)[:-4]
+                # if no transcription for audio then skipped
+                if audio_id not in transcript_dict:
+                    continue
+
+                # the speaker id is the name of the folder holding the audio
+                utt2spk = Path(audio_path).parent.name
+                audio_data, samplerate = soundfile.read(audio_path)
+                duration = float(len(audio_data) / samplerate)
+                text = transcript_dict[audio_id]
+                json_lines.append(
+                    json.dumps(
+                        {
+                            'utt': audio_id,
+                            'utt2spk': str(utt2spk),
+                            'feat': audio_path,
+                            'feat_shape': (duration, ),  # second
+                            'text': text
+                        },
+                        ensure_ascii=False))
+
+                total_sec += duration
+                total_text += len(text)
+                total_num += 1
+
+        manifest_path = manifest_path_prefix + '.' + dtype
+        with codecs.open(manifest_path, 'w', 'utf-8') as fout:
+            for line in json_lines:
+                fout.write(line + '\n')
+
+        meta = dict()
+        meta["dtype"] = dtype  # train, dev, test
+        meta["utts"] = total_num
+        meta["hours"] = total_sec / (60 * 60)
+        meta["text"] = total_text
+        # An empty or missing split used to raise ZeroDivisionError here;
+        # report zero rates instead so the remaining splits still get written.
+        meta["text/sec"] = total_text / total_sec if total_sec else 0.0
+        meta["sec/utt"] = total_sec / total_num if total_num else 0.0
+        data_metas[dtype] = meta
+
+        manifest_dir = os.path.dirname(manifest_path_prefix)
+        meta_path = os.path.join(manifest_dir, dtype) + '.meta'
+        with open(meta_path, 'w') as f:
+            for key, val in meta.items():
+                print(f"{key}: {val}", file=f)
+
+    return data_metas
+
+
+def download_dataset(url, md5sum, target_dir):
+    """Download and unpack one Aishell archive unless already extracted.
+
+    The extraction directory is derived from the archive name in ``url``
+    (``.../data_aishell.tgz`` -> ``<target_dir>/data_aishell``). Previously
+    the check was hard-coded to ``data_aishell``, so a second call for the
+    resource archive always found that directory and never downloaded.
+
+    Args:
+        url: URL of the ``.tgz`` archive.
+        md5sum: Expected md5 checksum, forwarded to ``download``.
+        target_dir: Directory the archive is downloaded and unpacked into.
+
+    Returns:
+        Absolute path of the extraction directory.
+    """
+    # NOTE(review): assumes each archive unpacks into a top-level directory
+    # named after the archive file -- confirm for resource_aishell.tgz.
+    archive_stem = os.path.basename(url).split('.')[0] or 'data_aishell'
+    data_dir = os.path.join(target_dir, archive_stem)
+    if not os.path.exists(data_dir):
+        filepath = download(url, md5sum, target_dir)
+        unpack(filepath, target_dir)
+        # unpack all audio tar files (no-op when the archive has no wav/ dir)
+        audio_dir = os.path.join(data_dir, 'wav')
+        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
+            for ftar in filelist:
+                unpack(os.path.join(subfolder, ftar), subfolder, True)
+    else:
+        print("Skip downloading and unpacking. Data already exists in %s." %
+              os.path.abspath(target_dir))
+    return os.path.abspath(data_dir)
+
+
+def check_dataset(data_dir):
+    """Verify that ``data_dir`` looks like an extracted Aishell corpus.
+
+    Checks that the transcript file and the ``wav/{train,dev,test}``
+    directories exist and that every audio file that has a transcript is
+    sampled at 16 kHz. Files without a transcript are only reported.
+
+    Args:
+        data_dir: Directory holding ``transcript/`` and ``wav/``.
+
+    Raises:
+        FileNotFoundError: If the transcript file is missing.
+        IOError: If a split directory is missing.
+        AssertionError: If an audio file is not sampled at 16 kHz.
+    """
+    print(f"check dataset {os.path.abspath(data_dir)} ...")
+
+    transcript_path = os.path.join(data_dir, 'transcript',
+                                   'aishell_transcript_v0.8.txt')
+    if not os.path.exists(transcript_path):
+        raise FileNotFoundError(f"no transcript file found in {data_dir}.")
+
+    # Only the utterance ids are needed here, not the transcript text.
+    labeled_ids = set()
+    # Context manager: the transcript handle was previously never closed.
+    with codecs.open(transcript_path, 'r', 'utf-8') as fin:
+        for line in fin:
+            line = line.strip()
+            if line == '':
+                continue
+            audio_id, _ = line.split(' ', 1)
+            labeled_ids.add(audio_id)
+
+    data_types = ['train', 'dev', 'test']
+    for dtype in data_types:
+        audio_dir = os.path.join(data_dir, 'wav', dtype)
+        if not os.path.exists(audio_dir):
+            raise IOError(f"{audio_dir} does not exist.")
+
+        # Count per split: the counter used to be shared across splits, so
+        # later splits reported the running total.
+        no_label = 0
+        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
+            for fname in filelist:
+                audio_path = os.path.abspath(os.path.join(subfolder, fname))
+                audio_id = os.path.basename(fname)[:-4]
+                # if no transcription for audio then skipped
+                if audio_id not in labeled_ids:
+                    print(f"Warning: {audio_id} not has transcript.")
+                    no_label += 1
+                    continue
+
+                _, samplerate = soundfile.read(audio_path)
+                # Explicit raise instead of `assert` so the check is not
+                # stripped when Python runs with -O.
+                if samplerate != 16000:
+                    raise AssertionError(
+                        f"{audio_path} sample rate is {samplerate} not 16k, please check."
+                    )
+
+        # Only warn when something is actually missing.
+        if no_label:
+            print(
+                f"Warning: {dtype} has {no_label} audio does not has transcript."
+            )
+
+
+def prepare_dataset(url, md5sum, target_dir, manifest_path=None, check=False):
+    """Download and unpack an archive, optionally check it and build manifests.
+
+    Args:
+        url: URL of the archive, forwarded to ``download_dataset``.
+        md5sum: Expected md5 checksum of the archive.
+        target_dir: Directory the archive is downloaded and unpacked into.
+        manifest_path: Filepath prefix for the manifests; when falsy no
+            manifest is created.
+        check: When true, run ``check_dataset`` on the extracted directory.
+
+    Returns:
+        Tuple ``(data_dir, meta)`` where ``meta`` is the value returned by
+        ``create_manifest`` or ``None`` if no manifest was requested.
+
+    Raises:
+        ValueError: If ``check`` is true and ``check_dataset`` fails; the
+            original exception is attached as ``__cause__``.
+    """
+    data_dir = download_dataset(url, md5sum, target_dir)
+
+    if check:
+        try:
+            check_dataset(data_dir)
+        except Exception as e:
+            # Chain the cause so the real reason stays in the traceback.
+            raise ValueError(
+                f"{data_dir} dataset format not right, please check it."
+            ) from e
+
+    meta = None
+    if manifest_path:
+        meta = create_manifest(data_dir, manifest_path)
+
+    return data_dir, meta
+
+
+def main():
+    """Command-line entry point: prepare the Aishell data and resources."""
+    print(f"args: {args}")
+    if args.target_dir.startswith('~'):
+        args.target_dir = os.path.expanduser(args.target_dir)
+    target_dir = args.target_dir
+
+    # data archive: checked and turned into manifests
+    prepare_dataset(
+        url=DATA_URL,
+        md5sum=MD5_DATA,
+        target_dir=target_dir,
+        manifest_path=args.manifest_prefix,
+        check=True)
+
+    # resource archive: no manifest requested
+    prepare_dataset(
+        url=RESOURCE_URL,
+        md5sum=MD5_RESOURCE,
+        target_dir=target_dir,
+        manifest_path=None)
+
+    print("Data download and manifest prepare done!")
+
+
+if __name__ == '__main__':
+    main()
--- a/paddlespeech/dataset/download.py
+++ b/paddlespeech/dataset/download.py
@ -19,91 +19,16 @@ import zipfile
 from typing import Text

 __all__ = [
-    "check_md5sum", "getfile_insensitive", "download_multi", "download",
-    "unpack", "unzip", "md5file", "print_arguments", "add_arguments",
-    "get_commandline_args"
+    "check_md5sum",
+    "getfile_insensitive",
+    "download_multi",
+    "download",
+    "unpack",
+    "unzip",
+    "md5file",
 ]


-def get_commandline_args():
-    extra_chars = [
-        " ",
-        ";",
-        "&",
-        "(",
-        ")",
-        "|",
-        "^",
-        "<",
-        ">",
-        "?",
-        "*",
-        "[",
-        "]",
-        "$",
-        "`",
-        '"',
-        "\\",
-        "!",
-        "{",
-        "}",
-    ]
-
-    # Escape the extra characters for shell
-    argv = [
-        arg.replace("'", "'\\''") if all(char not in arg
-                                         for char in extra_chars) else
-        "'" + arg.replace("'", "'\\''") + "'" for arg in sys.argv
-    ]
-
-    return sys.executable + " " + " ".join(argv)
-
-
-def print_arguments(args, info=None):
-    """Print argparse's arguments.
-
-    Usage:
-
-    .. code-block:: python
-
-        parser = argparse.ArgumentParser()
-        parser.add_argument("name", default="Jonh", type=str, help="User name.")
-        args = parser.parse_args()
-        print_arguments(args)
-
-    :param args: Input argparse.Namespace for printing.
-    :type args: argparse.Namespace
-    """
-    filename = ""
-    if info:
-        filename = info["__file__"]
-    filename = os.path.basename(filename)
-    print(f"----------- {filename} Configuration Arguments -----------")
-    for arg, value in sorted(vars(args).items()):
-        print("%s: %s" % (arg, value))
-    print("-----------------------------------------------------------")
-
-
-def add_arguments(argname, type, default, help, argparser, **kwargs):
-    """Add argparse's argument.
-
-    Usage:
-
-    .. code-block:: python
-
-        parser = argparse.ArgumentParser()
-        add_argument("name", str, "Jonh", "User name.", parser)
-        args = parser.parse_args()
-    """
-    type = distutils.util.strtobool if type == bool else type
-    argparser.add_argument(
-        "--" + argname,
-        default=default,
-        type=type,
-        help=help + ' Default: %(default)s.',
-        **kwargs)
-
-
 def md5file(fname):
    hash_md5 = hashlib.md5()
    f = open(fname, "rb")
--- a/paddlespeech/utils/argparse.py
+++ b/paddlespeech/utils/argparse.py
@ -0,0 +1,98 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import hashlib
+import os
+import sys
+from typing import Text
+
+__all__ = ["print_arguments", "add_arguments", "get_commandline_args"]
+
+
+def get_commandline_args():
+    """Return the running command line as one shell-escaped string.
+
+    The result is ``sys.executable`` followed by every entry of ``sys.argv``,
+    separated by single spaces. An argument is wrapped in single quotes when
+    it is empty or contains a character the shell treats specially; embedded
+    single quotes are written as ``'\\''``. ``sys.executable`` itself is
+    emitted as-is.
+
+    Returns:
+        str: The reconstructed command line.
+    """
+    # A single quote is part of the set: escaping it as '\'' is only valid
+    # when the whole argument is enclosed in single quotes.
+    special_chars = frozenset(" ;&()|^<>?*[]$`\"\\!{}'")
+
+    def _quote(arg):
+        # Plain, non-empty words can be emitted verbatim.
+        if arg and special_chars.isdisjoint(arg):
+            return arg
+        # Escape the extra characters for shell
+        return "'" + arg.replace("'", "'\\''") + "'"
+
+    return sys.executable + " " + " ".join(_quote(arg) for arg in sys.argv)
+
+
+def print_arguments(args, info=None):
+    """Print every attribute of an argparse namespace, sorted by name.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        parser.add_argument("name", default="John", type=str, help="User name.")
+        args = parser.parse_args()
+        print_arguments(args)
+
+    :param args: Input argparse.Namespace for printing.
+    :type args: argparse.Namespace
+    :param info: Optional mapping holding a ``"__file__"`` entry; its base
+        name is shown in the banner. The banner name is empty when omitted.
+    """
+    # Only the final path component of the script is displayed.
+    filename = os.path.basename(info["__file__"] if info else "")
+    print(f"----------- {filename} Configuration Arguments -----------")
+    for name_and_value in sorted(vars(args).items()):
+        print("%s: %s" % name_and_value)
+    print("-----------------------------------------------------------")
+
+
+def _strtobool(value):
+    """Convert a textual truth value to ``1`` or ``0``.
+
+    Accepts (case-insensitively) ``y/yes/t/true/on/1`` and
+    ``n/no/f/false/off/0``; anything else raises ``ValueError``.
+    """
+    lowered = value.lower()
+    if lowered in ("y", "yes", "t", "true", "on", "1"):
+        return 1
+    if lowered in ("n", "no", "f", "false", "off", "0"):
+        return 0
+    raise ValueError(f"invalid truth value {value!r}")
+
+
+def add_arguments(argname, type, default, help, argparser, **kwargs):
+    """Add argparse's argument.
+
+    Registers ``--<argname>`` on ``argparser``. The help text gets a
+    ``Default: ...`` suffix appended.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        add_arguments("name", str, "John", "User name.", parser)
+        args = parser.parse_args()
+
+    :param argname: Option name without the leading ``--``.
+    :param type: Converter for the command-line value. ``bool`` is replaced
+        by a string-to-truth-value parser, so ``--flag false`` parses as
+        false instead of the truthy non-empty string ``bool`` would see.
+    :param default: Default value of the option.
+    :param help: Help text for the option.
+    :param argparser: Parser the option is added to.
+    :param kwargs: Extra keyword arguments forwarded to ``add_argument``.
+    """
+    # This module does not import distutils (and it no longer exists on
+    # Python 3.12+), so the bool converter is provided locally.
+    type = _strtobool if type == bool else type
+    argparser.add_argument(
+        "--" + argname,
+        default=default,
+        type=type,
+        help=help + ' Default: %(default)s.',
+        **kwargs)
--- a/tests/test_tipc/conformer/scripts/aishell_tiny.py
+++ b/tests/test_tipc/conformer/scripts/aishell_tiny.py
@ -26,8 +26,8 @@ from pathlib import Path

 import soundfile

-from utils.utility import download
-from utils.utility import unpack
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unpack

 DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

--- a/tests/unit/cli/aishell_test_prepare.py
+++ b/tests/unit/cli/aishell_test_prepare.py
@ -25,8 +25,8 @@ from pathlib import Path

 import soundfile

-from utils.utility import download
-from utils.utility import unpack
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unpack

 DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

--- a/utils/manifest_key_value.py
+++ b/utils/manifest_key_value.py
@ -6,8 +6,8 @@ from pathlib import Path

 import jsonlines

-from utils.utility import add_arguments
-from utils.utility import print_arguments
+from paddlespeech.utils.argparse import add_arguments
+from paddlespeech.utils.argparse import print_arguments


 def main(args):