[s2t] mv dataset into paddlespeech.dataset (#3183)
* mv dataset into paddlespeech.dataset * add aidatatang * fix importpull/3193/head
parent
3ad55a31e7
commit
35d874c532
@ -1,3 +0,0 @@
|
|||||||
# [Aishell1](http://openslr.elda.org/33/)
|
|
||||||
|
|
||||||
This Open Source Mandarin Speech Corpus, AISHELL-ASR0009-OS1, is 178 hours long. It is a part of AISHELL-ASR0009, of which utterance contains 11 domains, including smart home, autonomous driving, and industrial production. The whole recording was put in quiet indoor environment, using 3 different devices at the same time: high fidelity microphone (44.1kHz, 16-bit,); Android-system mobile phone (16kHz, 16-bit), iOS-system mobile phone (16kHz, 16-bit). Audios in high fidelity were re-sampled to 16kHz to build AISHELL- ASR0009-OS1. 400 speakers from different accent areas in China were invited to participate in the recording. The manual transcription accuracy rate is above 95%, through professional speech annotation and strict quality inspection. The corpus is divided into training, development and testing sets. ( This database is free for academic research, not in the commerce, if without permission. )
|
|
@ -0,0 +1,14 @@
|
|||||||
|
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from .aidatatang_200zh import main as aidatatang_200zh_main
|
@ -0,0 +1,157 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""Prepare aidatatang_200zh mandarin dataset
|
||||||
|
|
||||||
|
Download, unpack and create manifest files.
|
||||||
|
Manifest file is a json-format file with each line containing the
|
||||||
|
meta data (i.e. audio filepath, transcript and audio duration)
|
||||||
|
of each audio file in the data set.
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import codecs
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import soundfile
|
||||||
|
|
||||||
|
from paddlespeech.dataset.download import download
|
||||||
|
from paddlespeech.dataset.download import unpack
|
||||||
|
|
||||||
|
# Default cache root shared by the paddle speech dataset downloaders.
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

# OpenSLR resource 62: the aidatatang_200zh Mandarin corpus.
URL_ROOT = 'http://www.openslr.org/resources/62'
# URL_ROOT = 'https://openslr.magicdatatech.com/resources/62'
DATA_URL = URL_ROOT + '/aidatatang_200zh.tgz'
# Expected md5 checksum of the archive above (verified after download).
MD5_DATA = '6e0f4f39cd5f667a7ee53c397c8d0949'

# NOTE(review): arguments are parsed at import time; `args` is consumed by
# main() below — importing this module with unrelated argv may fail. Confirm
# this module is only ever run as a script.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--target_dir",
    default=DATA_HOME + "/aidatatang_200zh",
    type=str,
    help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
    "--manifest_prefix",
    default="manifest",
    type=str,
    help="Filepath prefix for output manifests. (default: %(default)s)")
args = parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def create_manifest(data_dir, manifest_path_prefix):
    """Write manifest.{train,dev,test} json files plus per-split *.meta files.

    Each manifest line is a JSON object with keys: utt (audio id),
    utt2spk (speaker directory name), feat (absolute wav path),
    feat_shape (duration in seconds,), text (whitespace-stripped transcript).

    :param data_dir: root of the unpacked aidatatang_200zh corpus.
    :param manifest_path_prefix: output path prefix; '.<dtype>' is appended.
    """
    print("Creating manifest %s ..." % manifest_path_prefix)
    # One transcript file maps every audio id to its text for the whole corpus.
    transcript_path = os.path.join(data_dir, 'transcript',
                                   'aidatatang_200_zh_transcript.txt')
    transcript_dict = {}
    # `with` closes the handle (the previous bare iteration leaked it).
    with codecs.open(transcript_path, 'r', 'utf-8') as ftrans:
        for line in ftrans:
            line = line.strip()
            if not line:
                continue
            audio_id, text = line.split(' ', 1)
            # remove whitespace: character-level text
            text = ''.join(text.split())
            transcript_dict[audio_id] = text

    data_types = ['train', 'dev', 'test']
    for dtype in data_types:
        json_lines = []
        total_sec = 0.0
        total_text = 0.0
        total_num = 0

        audio_dir = os.path.join(data_dir, 'corpus/', dtype)
        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
            for fname in filelist:
                if not fname.endswith('.wav'):
                    continue

                audio_path = os.path.abspath(os.path.join(subfolder, fname))
                audio_id = os.path.basename(fname)[:-4]
                # Speaker id is the wav file's parent directory name.
                utt2spk = Path(audio_path).parent.name

                audio_data, samplerate = soundfile.read(audio_path)
                duration = float(len(audio_data) / samplerate)
                # NOTE: a KeyError here means the wav has no transcript entry
                # (the aishell variant skips such files instead).
                text = transcript_dict[audio_id]
                json_lines.append(
                    json.dumps(
                        {
                            'utt': audio_id,
                            'utt2spk': str(utt2spk),
                            'feat': audio_path,
                            'feat_shape': (duration, ),  # second
                            'text': text,
                        },
                        ensure_ascii=False))

                total_sec += duration
                total_text += len(text)
                total_num += 1

        manifest_path = manifest_path_prefix + '.' + dtype
        with codecs.open(manifest_path, 'w', 'utf-8') as fout:
            for line in json_lines:
                fout.write(line + '\n')

        manifest_dir = os.path.dirname(manifest_path_prefix)
        meta_path = os.path.join(manifest_dir, dtype) + '.meta'
        with open(meta_path, 'w') as f:
            print(f"{dtype}:", file=f)
            print(f"{total_num} utts", file=f)
            print(f"{total_sec / (60*60)} h", file=f)
            print(f"{total_text} text", file=f)
            # Guard empty splits: the unguarded divisions raised
            # ZeroDivisionError when a split directory was missing or empty.
            if total_sec > 0:
                print(f"{total_text / total_sec} text/sec", file=f)
            if total_num > 0:
                print(f"{total_sec / total_num} sec/utt", file=f)
|
||||||
|
|
||||||
|
|
||||||
|
def prepare_dataset(url, md5sum, target_dir, manifest_path, subset):
    """Download and unpack the archive, then write manifest files.

    :param url: archive url.
    :param md5sum: expected md5 of the archive.
    :param target_dir: directory to download into and unpack under.
    :param manifest_path: manifest output prefix passed to create_manifest.
    :param subset: corpus subdirectory name inside target_dir.
    """
    data_dir = os.path.join(target_dir, subset)
    if os.path.exists(data_dir):
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)
    else:
        archive = download(url, md5sum, target_dir)
        unpack(archive, target_dir)
        # Every speaker directory under corpus/ holds nested tar files;
        # unpack each of them in place.
        audio_root = os.path.join(data_dir, 'corpus')
        for parent, child_dirs, _ in sorted(os.walk(audio_root)):
            for child in child_dirs:
                print(f"unpack dir {child}...")
                for folder, _, tar_names in sorted(
                        os.walk(os.path.join(parent, child))):
                    for tar_name in tar_names:
                        unpack(os.path.join(folder, tar_name), folder, True)

    create_manifest(data_dir, manifest_path)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: fetch aidatatang_200zh and emit manifest files."""
    print(f"args: {args}")
    # Expand a leading '~' in the user-supplied target directory.
    target_dir = args.target_dir
    if target_dir.startswith('~'):
        target_dir = os.path.expanduser(target_dir)
        args.target_dir = target_dir

    prepare_dataset(
        url=DATA_URL,
        md5sum=MD5_DATA,
        target_dir=target_dir,
        manifest_path=args.manifest_prefix,
        subset='aidatatang_200zh')

    print("Data download and manifest prepare done!")


if __name__ == '__main__':
    main()
|
@ -0,0 +1,58 @@
|
|||||||
|
# [Aishell1](http://openslr.elda.org/33/)
|
||||||
|
|
||||||
|
This Open Source Mandarin Speech Corpus, AISHELL-ASR0009-OS1, is 178 hours long. It is a part of AISHELL-ASR0009, of which utterance contains 11 domains, including smart home, autonomous driving, and industrial production. The whole recording was put in quiet indoor environment, using 3 different devices at the same time: high fidelity microphone (44.1kHz, 16-bit,); Android-system mobile phone (16kHz, 16-bit), iOS-system mobile phone (16kHz, 16-bit). Audios in high fidelity were re-sampled to 16kHz to build AISHELL- ASR0009-OS1. 400 speakers from different accent areas in China were invited to participate in the recording. The manual transcription accuracy rate is above 95%, through professional speech annotation and strict quality inspection. The corpus is divided into training, development and testing sets. (This database is free for academic research; commercial use requires permission.)
|
||||||
|
|
||||||
|
|
||||||
|
## Dataset Architecture
|
||||||
|
|
||||||
|
```bash
|
||||||
|
data_aishell
|
||||||
|
├── transcript # text 目录
|
||||||
|
└── wav # wav 目录
|
||||||
|
├── dev # dev 目录
|
||||||
|
│ ├── S0724 # spk 目录
|
||||||
|
│ ├── S0725
|
||||||
|
│ ├── S0726
|
||||||
|
├── train
|
||||||
|
│ ├── S0724
|
||||||
|
│ ├── S0725
|
||||||
|
│ ├── S0726
|
||||||
|
├── test
|
||||||
|
│ ├── S0724
|
||||||
|
│ ├── S0725
|
||||||
|
│ ├── S0726
|
||||||
|
|
||||||
|
|
||||||
|
data_aishell
|
||||||
|
├── transcript
|
||||||
|
│ └── aishell_transcript_v0.8.txt # 文本标注文件
|
||||||
|
└── wav
|
||||||
|
├── dev
|
||||||
|
│ ├── S0724
|
||||||
|
│ │ ├── BAC009S0724W0121.wav # S0724 的音频
|
||||||
|
│ │ ├── BAC009S0724W0122.wav
|
||||||
|
│ │ ├── BAC009S0724W0123.wav
|
||||||
|
├── test
|
||||||
|
│ ├── S0724
|
||||||
|
│ │ ├── BAC009S0724W0121.wav
|
||||||
|
│ │ ├── BAC009S0724W0122.wav
|
||||||
|
│ │ ├── BAC009S0724W0123.wav
|
||||||
|
├── train
|
||||||
|
│ ├── S0724
|
||||||
|
│ │ ├── BAC009S0724W0121.wav
|
||||||
|
│ │ ├── BAC009S0724W0122.wav
|
||||||
|
│ │ ├── BAC009S0724W0123.wav
|
||||||
|
|
||||||
|
标注文件格式: <utt> <tokens>
|
||||||
|
> head data_aishell/transcript/aishell_transcript_v0.8.txt
|
||||||
|
BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
|
||||||
|
BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
|
||||||
|
BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
|
||||||
|
BAC009S0002W0125 各地 政府 便 纷纷 跟进
|
||||||
|
BAC009S0002W0126 仅 一 个 多 月 的 时间 里
|
||||||
|
BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
|
||||||
|
BAC009S0002W0128 四十六 个 限 购 城市 当中
|
||||||
|
BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
|
||||||
|
BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
|
||||||
|
BAC009S0002W0131 显示 出 了 极 强 的 威力
|
||||||
|
```
|
@ -0,0 +1,18 @@
|
|||||||
|
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from .aishell import check_dataset
|
||||||
|
from .aishell import create_manifest
|
||||||
|
from .aishell import download_dataset
|
||||||
|
from .aishell import main as aishell_main
|
||||||
|
from .aishell import prepare_dataset
|
@ -0,0 +1,229 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""Prepare Aishell mandarin dataset
|
||||||
|
|
||||||
|
Download, unpack and create manifest files.
|
||||||
|
Manifest file is a json-format file with each line containing the
|
||||||
|
meta data (i.e. audio filepath, transcript and audio duration)
|
||||||
|
of each audio file in the data set.
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import codecs
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import soundfile
|
||||||
|
|
||||||
|
from paddlespeech.dataset.download import download
|
||||||
|
from paddlespeech.dataset.download import unpack
|
||||||
|
|
||||||
|
# Default cache root shared by the paddle speech dataset downloaders.
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

# OpenSLR resource 33: the Aishell-1 Mandarin corpus.
URL_ROOT = 'http://openslr.elda.org/resources/33'
# URL_ROOT = 'https://openslr.magicdatatech.com/resources/33'
DATA_URL = URL_ROOT + '/data_aishell.tgz'
# Expected md5 checksum of the audio archive.
MD5_DATA = '2f494334227864a8a8fec932999db9d8'
# Supplementary resources (lexicon etc.) and their checksum.
RESOURCE_URL = URL_ROOT + '/resource_aishell.tgz'
MD5_RESOURCE = '957d480a0fcac85fc18e550756f624e5'

# NOTE(review): arguments are parsed at import time; `args` is consumed by
# main() below — importing this module with unrelated argv may fail. Confirm
# this module is only ever run as a script.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--target_dir",
    default=DATA_HOME + "/Aishell",
    type=str,
    help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
    "--manifest_prefix",
    default="manifest",
    type=str,
    help="Filepath prefix for output manifests. (default: %(default)s)")
args = parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def create_manifest(data_dir, manifest_path_prefix):
    """Write manifest.{train,dev,test} json files plus per-split *.meta files.

    Each manifest line is a JSON object with keys: utt (audio id),
    utt2spk (speaker directory name), feat (absolute wav path),
    feat_shape (duration in seconds,), text (whitespace-stripped transcript).

    :param data_dir: root of the unpacked data_aishell directory.
    :param manifest_path_prefix: output path prefix; '.<dtype>' is appended.
    :return: dict mapping dtype -> meta stats (dtype, utts, hours, text,
        text/sec, sec/utt).
    """
    print("Creating manifest %s ..." % os.path.join(data_dir,
                                                    manifest_path_prefix))
    transcript_path = os.path.join(data_dir, 'transcript',
                                   'aishell_transcript_v0.8.txt')
    # audio id -> whitespace-stripped character transcript
    transcript_dict = {}
    # `with` closes the handle (the previous bare iteration leaked it).
    with codecs.open(transcript_path, 'r', 'utf-8') as ftrans:
        for line in ftrans:
            line = line.strip()
            if not line:
                continue
            audio_id, text = line.split(' ', 1)
            # remove whitespace: character-level text
            text = ''.join(text.split())
            transcript_dict[audio_id] = text

    data_metas = dict()
    data_types = ['train', 'dev', 'test']
    for dtype in data_types:
        json_lines = []
        total_sec = 0.0
        total_text = 0.0
        total_num = 0

        audio_dir = os.path.join(data_dir, 'wav', dtype)
        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
            for fname in filelist:
                audio_path = os.path.abspath(os.path.join(subfolder, fname))
                audio_id = os.path.basename(fname)[:-4]
                # if no transcription for audio then skipped
                if audio_id not in transcript_dict:
                    continue

                # Speaker id is the wav file's parent directory name.
                utt2spk = Path(audio_path).parent.name
                audio_data, samplerate = soundfile.read(audio_path)
                duration = float(len(audio_data) / samplerate)
                text = transcript_dict[audio_id]
                json_lines.append(
                    json.dumps(
                        {
                            'utt': audio_id,
                            'utt2spk': str(utt2spk),
                            'feat': audio_path,
                            'feat_shape': (duration, ),  # second
                            'text': text
                        },
                        ensure_ascii=False))

                total_sec += duration
                total_text += len(text)
                total_num += 1

        manifest_path = manifest_path_prefix + '.' + dtype
        with codecs.open(manifest_path, 'w', 'utf-8') as fout:
            for line in json_lines:
                fout.write(line + '\n')

        meta = dict()
        meta["dtype"] = dtype  # train, dev, test
        meta["utts"] = total_num
        meta["hours"] = total_sec / (60 * 60)
        meta["text"] = total_text
        # Guard empty splits: the unguarded divisions raised
        # ZeroDivisionError when a split directory was missing or empty.
        meta["text/sec"] = total_text / total_sec if total_sec else 0.0
        meta["sec/utt"] = total_sec / total_num if total_num else 0.0
        data_metas[dtype] = meta

        manifest_dir = os.path.dirname(manifest_path_prefix)
        meta_path = os.path.join(manifest_dir, dtype) + '.meta'
        with open(meta_path, 'w') as f:
            for key, val in meta.items():
                print(f"{key}: {val}", file=f)

    return data_metas
|
||||||
|
|
||||||
|
|
||||||
|
def download_dataset(url, md5sum, target_dir):
    """Fetch and unpack the archive; return the absolute corpus root.

    :param url: archive url.
    :param md5sum: expected md5 of the archive.
    :param target_dir: directory to download into and unpack under.
    :return: absolute path of the unpacked data_aishell directory.
    """
    data_dir = os.path.join(target_dir, 'data_aishell')
    if os.path.exists(data_dir):
        print("Skip downloading and unpacking. Data already exists in %s." %
              os.path.abspath(target_dir))
    else:
        archive = download(url, md5sum, target_dir)
        unpack(archive, target_dir)
        # The wav/ tree ships as one tar per speaker; unpack each in place.
        wav_root = os.path.join(data_dir, 'wav')
        for folder, _, tar_names in sorted(os.walk(wav_root)):
            for tar_name in tar_names:
                unpack(os.path.join(folder, tar_name), folder, True)
    return os.path.abspath(data_dir)
|
||||||
|
|
||||||
|
|
||||||
|
def check_dataset(data_dir):
    """Sanity-check an unpacked Aishell-1 tree under *data_dir*.

    Verifies the transcript file and the train/dev/test wav directories
    exist, reads every wav file, and asserts a 16 kHz sample rate.
    Audio ids missing from the transcript are only warned about.

    :param data_dir: root of the unpacked data_aishell directory.
    :raises FileNotFoundError: if the transcript file is missing.
    :raises IOError: if a wav split directory is missing.
    :raises AssertionError: if any wav is not sampled at 16 kHz.
    """
    print(f"check dataset {os.path.abspath(data_dir)} ...")

    transcript_path = os.path.join(data_dir, 'transcript',
                                   'aishell_transcript_v0.8.txt')
    if not os.path.exists(transcript_path):
        raise FileNotFoundError(f"no transcript file found in {data_dir}.")

    # audio id -> whitespace-stripped character transcript
    transcript_dict = {}
    for line in codecs.open(transcript_path, 'r', 'utf-8'):
        line = line.strip()
        if line == '':
            continue
        audio_id, text = line.split(' ', 1)
        # remove whitespace: character-level text
        text = ''.join(text.split())
        transcript_dict[audio_id] = text

    no_label = 0
    data_types = ['train', 'dev', 'test']
    for dtype in data_types:
        audio_dir = os.path.join(data_dir, 'wav', dtype)
        if not os.path.exists(audio_dir):
            raise IOError(f"{audio_dir} does not exist.")

        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
            for fname in filelist:
                audio_path = os.path.abspath(os.path.join(subfolder, fname))
                audio_id = os.path.basename(fname)[:-4]
                # if no transcription for audio then skipped
                if audio_id not in transcript_dict:
                    print(f"Warning: {audio_id} not has transcript.")
                    no_label += 1
                    continue

                # utt2spk (speaker dir name) is computed but unused here.
                utt2spk = Path(audio_path).parent.name
                audio_data, samplerate = soundfile.read(audio_path)
                assert samplerate == 16000, f"{audio_path} sample rate is {samplerate} not 16k, please check."

        # NOTE(review): no_label accumulates across splits, so this per-split
        # warning reports a running total — confirm that is intended.
        print(f"Warning: {dtype} has {no_label} audio does not has transcript.")
|
||||||
|
|
||||||
|
|
||||||
|
def prepare_dataset(url, md5sum, target_dir, manifest_path=None, check=False):
    """Download, unpack, optionally validate, and create manifest files.

    :param url: archive url.
    :param md5sum: expected md5 of the archive.
    :param target_dir: directory to download into and unpack under.
    :param manifest_path: manifest output prefix; skip manifests when None.
    :param check: when True, validate the unpacked corpus layout first.
    :return: (data_dir, meta) — meta is None when no manifest was written.
    :raises ValueError: if `check` is set and the dataset layout is invalid.
    """
    data_dir = download_dataset(url, md5sum, target_dir)

    if check:
        try:
            check_dataset(data_dir)
        except Exception as e:
            # Chain the original failure (`from e`) so the real cause is not
            # lost; the bare re-raise previously discarded it.
            raise ValueError(
                f"{data_dir} dataset format not right, please check it."
            ) from e

    meta = None
    if manifest_path:
        meta = create_manifest(data_dir, manifest_path)

    return data_dir, meta
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: fetch Aishell-1 audio + resources, write manifests."""
    print(f"args: {args}")
    # Expand a leading '~' in the user-supplied target directory.
    target_dir = args.target_dir
    if target_dir.startswith('~'):
        target_dir = os.path.expanduser(target_dir)
        args.target_dir = target_dir

    # Main corpus: validate the layout and emit manifest files.
    data_dir, meta = prepare_dataset(
        DATA_URL, MD5_DATA, target_dir, args.manifest_prefix, check=True)

    # Supplementary resources (lexicon etc.): no manifest needed.
    resource_dir, _ = prepare_dataset(
        RESOURCE_URL, MD5_RESOURCE, target_dir, manifest_path=None)

    print("Data download and manifest prepare done!")


if __name__ == '__main__':
    main()
|
@ -0,0 +1,98 @@
|
|||||||
|
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import hashlib
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from typing import Text
|
||||||
|
|
||||||
|
# Public API of this argparse helper module.
__all__ = ["print_arguments", "add_arguments", "get_commandline_args"]
|
||||||
|
|
||||||
|
|
||||||
|
def get_commandline_args():
    """Reconstruct the current command line as a shell-pasteable string.

    :return: sys.executable followed by sys.argv, with arguments that
        contain shell metacharacters wrapped in single quotes.
    """
    # Characters with special meaning to the shell; any of them in an
    # argument forces the whole argument to be quoted.
    extra_chars = [
        " ", ";", "&", "(", ")", "|", "^", "<", ">", "?", "*", "[", "]",
        "$", "`", '"', "\\", "!", "{", "}",
    ]

    # Escape embedded single quotes; quote the argument only when needed.
    argv = []
    for arg in sys.argv:
        escaped = arg.replace("'", "'\\''")
        if any(char in arg for char in extra_chars):
            escaped = "'" + escaped + "'"
        argv.append(escaped)

    return sys.executable + " " + " ".join(argv)
|
||||||
|
|
||||||
|
|
||||||
|
def print_arguments(args, info=None):
    """Print argparse's arguments.

    Usage:

    .. code-block:: python

        parser = argparse.ArgumentParser()
        parser.add_argument("name", default="Jonh", type=str, help="User name.")
        args = parser.parse_args()
        print_arguments(args)

    :param args: Input argparse.Namespace for printing.
    :type args: argparse.Namespace
    :param info: Optional dict carrying '__file__' of the calling script,
        used to label the banner line.
    """
    filename = ""
    if info:
        filename = info["__file__"]
        filename = os.path.basename(filename)
    # `filename` was computed but never interpolated into the banner; the
    # f-string had no placeholder. Include it so the script name is shown.
    print(f"----------- {filename} Configuration Arguments -----------")
    for arg, value in sorted(vars(args).items()):
        print("%s: %s" % (arg, value))
    print("-----------------------------------------------------------")
|
||||||
|
|
||||||
|
|
||||||
|
def add_arguments(argname, type, default, help, argparser, **kwargs):
    """Add argparse's argument.

    Usage:

    .. code-block:: python

        parser = argparse.ArgumentParser()
        add_argument("name", str, "Jonh", "User name.", parser)
        args = parser.parse_args()

    :param argname: option name, registered as --<argname>.
    :param type: value type; bool is parsed leniently ("yes"/"no"/"1"/"0"...).
    :param default: default value, echoed in the generated help text.
    :param help: help text; ' Default: ...' is appended.
    :param argparser: argparse.ArgumentParser to register the option on.
    """

    def _strtobool(value):
        # Local replacement for distutils.util.strtobool: `distutils` was
        # never imported in this module (NameError at call time for bool
        # arguments) and is removed in Python 3.12. Keeps the 1/0 return
        # convention of the original.
        value = value.lower()
        if value in ("y", "yes", "t", "true", "on", "1"):
            return 1
        if value in ("n", "no", "f", "false", "off", "0"):
            return 0
        raise ValueError(f"invalid truth value {value!r}")

    type = _strtobool if type == bool else type
    argparser.add_argument(
        "--" + argname,
        default=default,
        type=type,
        help=help + ' Default: %(default)s.',
        **kwargs)
|
Loading…
Reference in new issue