From 35d874c5321e16eb57d8d9d77e7cbaec1ff3058d Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Fri, 21 Apr 2023 11:33:17 +0800
Subject: [PATCH] [s2t] mv dataset into paddlespeech.dataset (#3183)

* mv dataset into paddlespeech.dataset
* add aidatatang
* fix import
---
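Note for reviewers: call sites only swap the old top-level `utils.utility`
imports for the new package paths. A representative before/after sketch
(taken directly from the hunks below):

    # before
    from utils.utility import download
    from utils.utility import unpack

    # after
    from paddlespeech.dataset.download import download
    from paddlespeech.dataset.download import unpack
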
 dataset/aidatatang_200zh/aidatatang_200zh.py  | 136 +----------
 dataset/aishell/README.md                     |   3 -
 dataset/aishell/aishell.py                    | 140 +----------
 dataset/librispeech/librispeech.py            |   4 +-
 dataset/mini_librispeech/mini_librispeech.py  |   4 +-
 dataset/musan/musan.py                        |   4 +-
 dataset/rir_noise/rir_noise.py                |   4 +-
 dataset/thchs30/thchs30.py                    |   4 +-
 dataset/timit/timit.py                        |   2 +-
 dataset/voxceleb/voxceleb1.py                 |   6 +-
 dataset/voxceleb/voxceleb2.py                 |   6 +-
 dataset/voxforge/voxforge.py                  |   6 +-
 .../audio_searching/src/test_audio_search.py  |   4 +-
 demos/audio_searching/src/test_vpr_search.py  |   4 +-
 paddlespeech/__init__.py                      |   4 +
 paddlespeech/dataset/__init__.py              |   0
 .../dataset}/aidatatang_200zh/README.md       |   0
 .../dataset/aidatatang_200zh/__init__.py      |  14 ++
 .../aidatatang_200zh/aidatatang_200zh.py      | 157 ++++++++++++
 paddlespeech/dataset/aishell/README.md        |  58 +++++
 paddlespeech/dataset/aishell/__init__.py      |  18 ++
 paddlespeech/dataset/aishell/aishell.py       | 229 ++++++++++++++++++
 .../dataset/download.py                       |  89 +------
 paddlespeech/utils/argparse.py                |  98 ++++++++
 .../conformer/scripts/aishell_tiny.py         |   4 +-
 tests/unit/cli/aishell_test_prepare.py        |   4 +-
 utils/manifest_key_value.py                   |   4 +-
 27 files changed, 619 insertions(+), 387 deletions(-)
 delete mode 100644 dataset/aishell/README.md
 create mode 100644 paddlespeech/dataset/__init__.py
 rename {dataset => paddlespeech/dataset}/aidatatang_200zh/README.md (100%)
 create mode 100644 paddlespeech/dataset/aidatatang_200zh/__init__.py
 create mode 100644 paddlespeech/dataset/aidatatang_200zh/aidatatang_200zh.py
 create mode 100644 paddlespeech/dataset/aishell/README.md
 create mode 100644 paddlespeech/dataset/aishell/__init__.py
 create mode 100644 paddlespeech/dataset/aishell/aishell.py
 rename utils/utility.py => paddlespeech/dataset/download.py (59%)
 create mode 100644 paddlespeech/utils/argparse.py

diff --git a/dataset/aidatatang_200zh/aidatatang_200zh.py b/dataset/aidatatang_200zh/aidatatang_200zh.py
index 85f478c2..3b706c49 100644
--- a/dataset/aidatatang_200zh/aidatatang_200zh.py
+++ b/dataset/aidatatang_200zh/aidatatang_200zh.py
@@ -18,139 +18,7 @@ Manifest file is a json-format file with each line containing the
 meta data (i.e. audio filepath, transcript and audio duration)
 of each audio file in the data set.
 """
-import argparse
-import codecs
-import json
-import os
-from pathlib import Path
-
-import soundfile
-
-from utils.utility import download
-from utils.utility import unpack
-
-DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
-
-URL_ROOT = 'http://www.openslr.org/resources/62'
-# URL_ROOT = 'https://openslr.magicdatatech.com/resources/62'
-DATA_URL = URL_ROOT + '/aidatatang_200zh.tgz'
-MD5_DATA = '6e0f4f39cd5f667a7ee53c397c8d0949'
-
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
-    "--target_dir",
-    default=DATA_HOME + "/aidatatang_200zh",
-    type=str,
-    help="Directory to save the dataset. (default: %(default)s)")
-parser.add_argument(
-    "--manifest_prefix",
-    default="manifest",
-    type=str,
-    help="Filepath prefix for output manifests. (default: %(default)s)")
-args = parser.parse_args()
-
-
-def create_manifest(data_dir, manifest_path_prefix):
-    print("Creating manifest %s ..." % manifest_path_prefix)
-    json_lines = []
-    transcript_path = os.path.join(data_dir, 'transcript',
-                                   'aidatatang_200_zh_transcript.txt')
-    transcript_dict = {}
-    for line in codecs.open(transcript_path, 'r', 'utf-8'):
-        line = line.strip()
-        if line == '':
-            continue
-        audio_id, text = line.split(' ', 1)
-        # remove withespace, charactor text
-        text = ''.join(text.split())
-        transcript_dict[audio_id] = text
-
-    data_types = ['train', 'dev', 'test']
-    for dtype in data_types:
-        del json_lines[:]
-        total_sec = 0.0
-        total_text = 0.0
-        total_num = 0
-
-        audio_dir = os.path.join(data_dir, 'corpus/', dtype)
-        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
-            for fname in filelist:
-                if not fname.endswith('.wav'):
-                    continue
-
-                audio_path = os.path.abspath(os.path.join(subfolder, fname))
-                audio_id = os.path.basename(fname)[:-4]
-                utt2spk = Path(audio_path).parent.name
-
-                audio_data, samplerate = soundfile.read(audio_path)
-                duration = float(len(audio_data) / samplerate)
-                text = transcript_dict[audio_id]
-                json_lines.append(
-                    json.dumps(
-                        {
-                            'utt': audio_id,
-                            'utt2spk': str(utt2spk),
-                            'feat': audio_path,
-                            'feat_shape': (duration, ),  # second
-                            'text': text,
-                        },
-                        ensure_ascii=False))
-
-                total_sec += duration
-                total_text += len(text)
-                total_num += 1
-
-        manifest_path = manifest_path_prefix + '.' + dtype
-        with codecs.open(manifest_path, 'w', 'utf-8') as fout:
-            for line in json_lines:
-                fout.write(line + '\n')
-
-        manifest_dir = os.path.dirname(manifest_path_prefix)
-        meta_path = os.path.join(manifest_dir, dtype) + '.meta'
-        with open(meta_path, 'w') as f:
-            print(f"{dtype}:", file=f)
-            print(f"{total_num} utts", file=f)
-            print(f"{total_sec / (60*60)} h", file=f)
-            print(f"{total_text} text", file=f)
-            print(f"{total_text / total_sec} text/sec", file=f)
-            print(f"{total_sec / total_num} sec/utt", file=f)
-
-
-def prepare_dataset(url, md5sum, target_dir, manifest_path, subset):
-    """Download, unpack and create manifest file."""
-    data_dir = os.path.join(target_dir, subset)
-    if not os.path.exists(data_dir):
-        filepath = download(url, md5sum, target_dir)
-        unpack(filepath, target_dir)
-        # unpack all audio tar files
-        audio_dir = os.path.join(data_dir, 'corpus')
-        for subfolder, dirlist, filelist in sorted(os.walk(audio_dir)):
-            for sub in dirlist:
-                print(f"unpack dir {sub}...")
-                for folder, _, filelist in sorted(
-                        os.walk(os.path.join(subfolder, sub))):
-                    for ftar in filelist:
-                        unpack(os.path.join(folder, ftar), folder, True)
-    else:
-        print("Skip downloading and unpacking. Data already exists in %s." %
-              target_dir)
-
-    create_manifest(data_dir, manifest_path)
-
-
-def main():
-    if args.target_dir.startswith('~'):
-        args.target_dir = os.path.expanduser(args.target_dir)
-
-    prepare_dataset(
-        url=DATA_URL,
-        md5sum=MD5_DATA,
-        target_dir=args.target_dir,
-        manifest_path=args.manifest_prefix,
-        subset='aidatatang_200zh')
-
-    print("Data download and manifest prepare done!")
-
+from paddlespeech.dataset.aidatatang_200zh import aidatatang_200zh_main
 
 if __name__ == '__main__':
-    main()
+    aidatatang_200zh_main()
diff --git a/dataset/aishell/README.md b/dataset/aishell/README.md
deleted file mode 100644
index a7dd0cf3..00000000
--- a/dataset/aishell/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# [Aishell1](http://openslr.elda.org/33/)
-
-This Open Source Mandarin Speech Corpus, AISHELL-ASR0009-OS1, is 178 hours long. It is a part of AISHELL-ASR0009, of which utterance contains 11 domains, including smart home, autonomous driving, and industrial production. The whole recording was put in quiet indoor environment, using 3 different devices at the same time: high fidelity microphone (44.1kHz, 16-bit,); Android-system mobile phone (16kHz, 16-bit), iOS-system mobile phone (16kHz, 16-bit). Audios in high fidelity were re-sampled to 16kHz to build AISHELL- ASR0009-OS1. 400 speakers from different accent areas in China were invited to participate in the recording. The manual transcription accuracy rate is above 95%, through professional speech annotation and strict quality inspection. The corpus is divided into training, development and testing sets. ( This database is free for academic research, not in the commerce, if without permission. )
diff --git a/dataset/aishell/aishell.py b/dataset/aishell/aishell.py
index ec43104d..b3288757 100644
--- a/dataset/aishell/aishell.py
+++ b/dataset/aishell/aishell.py
@@ -18,143 +18,7 @@ Manifest file is a json-format file with each line containing the
 meta data (i.e. audio filepath, transcript and audio duration)
 of each audio file in the data set.
 """
-import argparse
-import codecs
-import json
-import os
-from pathlib import Path
-
-import soundfile
-
-from utils.utility import download
-from utils.utility import unpack
-
-DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
-
-URL_ROOT = 'http://openslr.elda.org/resources/33'
-# URL_ROOT = 'https://openslr.magicdatatech.com/resources/33'
-DATA_URL = URL_ROOT + '/data_aishell.tgz'
-MD5_DATA = '2f494334227864a8a8fec932999db9d8'
-RESOURCE_URL = URL_ROOT + '/resource_aishell.tgz'
-MD5_RESOURCE = '957d480a0fcac85fc18e550756f624e5'
-
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
-    "--target_dir",
-    default=DATA_HOME + "/Aishell",
-    type=str,
-    help="Directory to save the dataset. (default: %(default)s)")
-parser.add_argument(
-    "--manifest_prefix",
-    default="manifest",
-    type=str,
-    help="Filepath prefix for output manifests. (default: %(default)s)")
-args = parser.parse_args()
-
-
-def create_manifest(data_dir, manifest_path_prefix):
-    print("Creating manifest %s ..." % manifest_path_prefix)
-    json_lines = []
-    transcript_path = os.path.join(data_dir, 'transcript',
-                                   'aishell_transcript_v0.8.txt')
-    transcript_dict = {}
-    for line in codecs.open(transcript_path, 'r', 'utf-8'):
-        line = line.strip()
-        if line == '':
-            continue
-        audio_id, text = line.split(' ', 1)
-        # remove withespace, charactor text
-        text = ''.join(text.split())
-        transcript_dict[audio_id] = text
-
-    data_types = ['train', 'dev', 'test']
-    for dtype in data_types:
-        del json_lines[:]
-        total_sec = 0.0
-        total_text = 0.0
-        total_num = 0
-
-        audio_dir = os.path.join(data_dir, 'wav', dtype)
-        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
-            for fname in filelist:
-                audio_path = os.path.abspath(os.path.join(subfolder, fname))
-                audio_id = os.path.basename(fname)[:-4]
-                # if no transcription for audio then skipped
-                if audio_id not in transcript_dict:
-                    continue
-
-                utt2spk = Path(audio_path).parent.name
-                audio_data, samplerate = soundfile.read(audio_path)
-                duration = float(len(audio_data) / samplerate)
-                text = transcript_dict[audio_id]
-                json_lines.append(
-                    json.dumps(
-                        {
-                            'utt': audio_id,
-                            'utt2spk': str(utt2spk),
-                            'feat': audio_path,
-                            'feat_shape': (duration, ),  # second
-                            'text': text
-                        },
-                        ensure_ascii=False))
-
-                total_sec += duration
-                total_text += len(text)
-                total_num += 1
-
-        manifest_path = manifest_path_prefix + '.' + dtype
-        with codecs.open(manifest_path, 'w', 'utf-8') as fout:
-            for line in json_lines:
-                fout.write(line + '\n')
-
-        manifest_dir = os.path.dirname(manifest_path_prefix)
-        meta_path = os.path.join(manifest_dir, dtype) + '.meta'
-        with open(meta_path, 'w') as f:
-            print(f"{dtype}:", file=f)
-            print(f"{total_num} utts", file=f)
-            print(f"{total_sec / (60*60)} h", file=f)
-            print(f"{total_text} text", file=f)
-            print(f"{total_text / total_sec} text/sec", file=f)
-            print(f"{total_sec / total_num} sec/utt", file=f)
-
-
-def prepare_dataset(url, md5sum, target_dir, manifest_path=None):
-    """Download, unpack and create manifest file."""
-    data_dir = os.path.join(target_dir, 'data_aishell')
-    if not os.path.exists(data_dir):
-        filepath = download(url, md5sum, target_dir)
-        unpack(filepath, target_dir)
-        # unpack all audio tar files
-        audio_dir = os.path.join(data_dir, 'wav')
-        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
-            for ftar in filelist:
-                unpack(os.path.join(subfolder, ftar), subfolder, True)
-    else:
-        print("Skip downloading and unpacking. Data already exists in %s." %
-              target_dir)
-
-    if manifest_path:
-        create_manifest(data_dir, manifest_path)
-
-
-def main():
-    if args.target_dir.startswith('~'):
-        args.target_dir = os.path.expanduser(args.target_dir)
-
-    prepare_dataset(
-        url=DATA_URL,
-        md5sum=MD5_DATA,
-        target_dir=args.target_dir,
-        manifest_path=args.manifest_prefix)
-
-    prepare_dataset(
-        url=RESOURCE_URL,
-        md5sum=MD5_RESOURCE,
-        target_dir=args.target_dir,
-        manifest_path=None)
-
-    print("Data download and manifest prepare done!")
-
+from paddlespeech.dataset.aishell import aishell_main
 
 if __name__ == '__main__':
-    main()
+    aishell_main()
diff --git a/dataset/librispeech/librispeech.py b/dataset/librispeech/librispeech.py
index 2d6f1763..44567b0c 100644
--- a/dataset/librispeech/librispeech.py
+++ b/dataset/librispeech/librispeech.py
@@ -28,8 +28,8 @@ from multiprocessing.pool import Pool
 import distutils.util
 import soundfile
 
-from utils.utility import download
-from utils.utility import unpack
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unpack
 
 URL_ROOT = "http://openslr.elda.org/resources/12"
 #URL_ROOT = "https://openslr.magicdatatech.com/resources/12"
diff --git a/dataset/mini_librispeech/mini_librispeech.py b/dataset/mini_librispeech/mini_librispeech.py
index 0eb80bf8..24bd98d8 100644
--- a/dataset/mini_librispeech/mini_librispeech.py
+++ b/dataset/mini_librispeech/mini_librispeech.py
@@ -27,8 +27,8 @@ from multiprocessing.pool import Pool
 
 import soundfile
 
-from utils.utility import download
-from utils.utility import unpack
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unpack
 
 URL_ROOT = "http://openslr.elda.org/resources/31"
 URL_TRAIN_CLEAN = URL_ROOT + "/train-clean-5.tar.gz"
diff --git a/dataset/musan/musan.py b/dataset/musan/musan.py
index ae3430b2..85d986e8 100644
--- a/dataset/musan/musan.py
+++ b/dataset/musan/musan.py
@@ -29,8 +29,8 @@ import os
 
 import soundfile
 
-from utils.utility import download
-from utils.utility import unpack
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unpack
 
 DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
diff --git a/dataset/rir_noise/rir_noise.py b/dataset/rir_noise/rir_noise.py
index b1d47558..b98dff72 100644
--- a/dataset/rir_noise/rir_noise.py
+++ b/dataset/rir_noise/rir_noise.py
@@ -29,8 +29,8 @@ import os
 
 import soundfile
 
-from utils.utility import download
-from utils.utility import unzip
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unzip
 
 DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
diff --git a/dataset/thchs30/thchs30.py b/dataset/thchs30/thchs30.py
index d41c0e17..c5c3eb7a 100644
--- a/dataset/thchs30/thchs30.py
+++ b/dataset/thchs30/thchs30.py
@@ -27,8 +27,8 @@ from pathlib import Path
 
 import soundfile
 
-from utils.utility import download
-from utils.utility import unpack
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unpack
 
 DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
diff --git a/dataset/timit/timit.py b/dataset/timit/timit.py
index c4a9f066..f3889d17 100644
--- a/dataset/timit/timit.py
+++ b/dataset/timit/timit.py
@@ -28,7 +28,7 @@ from pathlib import Path
 
 import soundfile
 
-from utils.utility import unzip
+from paddlespeech.dataset.download import unzip
 
 URL_ROOT = ""
 MD5_DATA = "45c68037c7fdfe063a43c851f181fb2d"
diff --git a/dataset/voxceleb/voxceleb1.py b/dataset/voxceleb/voxceleb1.py
index 95827f70..8d410067 100644
--- a/dataset/voxceleb/voxceleb1.py
+++ b/dataset/voxceleb/voxceleb1.py
@@ -31,9 +31,9 @@ from pathlib import Path
 
 import soundfile
 
-from utils.utility import check_md5sum
-from utils.utility import download
-from utils.utility import unzip
+from paddlespeech.dataset.download import check_md5sum
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unzip
 
 # all the data will be download in the current data/voxceleb directory default
 DATA_HOME = os.path.expanduser('.')
diff --git a/dataset/voxceleb/voxceleb2.py b/dataset/voxceleb/voxceleb2.py
index fe9e8b9c..6df6d1f3 100644
--- a/dataset/voxceleb/voxceleb2.py
+++ b/dataset/voxceleb/voxceleb2.py
@@ -27,9 +27,9 @@ from pathlib import Path
 
 import soundfile
 
-from utils.utility import check_md5sum
-from utils.utility import download
-from utils.utility import unzip
+from paddlespeech.dataset.download import check_md5sum
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unzip
 
 # all the data will be download in the current data/voxceleb directory default
 DATA_HOME = os.path.expanduser('.')
diff --git a/dataset/voxforge/voxforge.py b/dataset/voxforge/voxforge.py
index 373791bf..327d200b 100644
--- a/dataset/voxforge/voxforge.py
+++ b/dataset/voxforge/voxforge.py
@@ -28,9 +28,9 @@ import subprocess
 
 import soundfile
 
-from utils.utility import download_multi
-from utils.utility import getfile_insensitive
-from utils.utility import unpack
+from paddlespeech.dataset.download import download_multi
+from paddlespeech.dataset.download import getfile_insensitive
+from paddlespeech.dataset.download import unpack
 
 DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
diff --git a/demos/audio_searching/src/test_audio_search.py b/demos/audio_searching/src/test_audio_search.py
index cb91e156..f9ea2929 100644
--- a/demos/audio_searching/src/test_audio_search.py
+++ b/demos/audio_searching/src/test_audio_search.py
@@ -14,8 +14,8 @@
 from audio_search import app
 from fastapi.testclient import TestClient
 
-from utils.utility import download
-from utils.utility import unpack
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unpack
 
 client = TestClient(app)
diff --git a/demos/audio_searching/src/test_vpr_search.py b/demos/audio_searching/src/test_vpr_search.py
index 298e12eb..cc795564 100644
--- a/demos/audio_searching/src/test_vpr_search.py
+++ b/demos/audio_searching/src/test_vpr_search.py
@@ -14,8 +14,8 @@
 from fastapi.testclient import TestClient
 from vpr_search import app
 
-from utils.utility import download
-from utils.utility import unpack
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unpack
 
 client = TestClient(app)
diff --git a/paddlespeech/__init__.py b/paddlespeech/__init__.py
index 6c7e75c1..969d189f 100644
--- a/paddlespeech/__init__.py
+++ b/paddlespeech/__init__.py
@@ -13,3 +13,7 @@
 # limitations under the License.
 import _locale
 _locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8'])
+
+__version__ = '0.0.0'
+
+__commit__ = '9cf8c1985a98bb380c183116123672976bdfe5c9'
diff --git a/paddlespeech/dataset/__init__.py b/paddlespeech/dataset/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/dataset/aidatatang_200zh/README.md b/paddlespeech/dataset/aidatatang_200zh/README.md
similarity index 100%
rename from dataset/aidatatang_200zh/README.md
rename to paddlespeech/dataset/aidatatang_200zh/README.md
diff --git a/paddlespeech/dataset/aidatatang_200zh/__init__.py b/paddlespeech/dataset/aidatatang_200zh/__init__.py
new file mode 100644
index 00000000..9146247d
--- /dev/null
+++ b/paddlespeech/dataset/aidatatang_200zh/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .aidatatang_200zh import main as aidatatang_200zh_main
diff --git a/paddlespeech/dataset/aidatatang_200zh/aidatatang_200zh.py b/paddlespeech/dataset/aidatatang_200zh/aidatatang_200zh.py
new file mode 100644
index 00000000..ba178567
--- /dev/null
+++ b/paddlespeech/dataset/aidatatang_200zh/aidatatang_200zh.py
@@ -0,0 +1,157 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Prepare aidatatang_200zh mandarin dataset
+
+Download, unpack and create manifest files.
+Manifest file is a json-format file with each line containing the
+meta data (i.e. audio filepath, transcript and audio duration)
+of each audio file in the data set.
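+
+Typical invocation (the manifest prefix below is illustrative, not a path
+required by this script):
+
+    python paddlespeech/dataset/aidatatang_200zh/aidatatang_200zh.py \
+        --target_dir=~/.cache/paddle/dataset/speech/aidatatang_200zh \
+        --manifest_prefix=data/manifest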
+""" +import argparse +import codecs +import json +import os +from pathlib import Path + +import soundfile + +from paddlespeech.dataset.download import download +from paddlespeech.dataset.download import unpack + +DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') + +URL_ROOT = 'http://www.openslr.org/resources/62' +# URL_ROOT = 'https://openslr.magicdatatech.com/resources/62' +DATA_URL = URL_ROOT + '/aidatatang_200zh.tgz' +MD5_DATA = '6e0f4f39cd5f667a7ee53c397c8d0949' + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--target_dir", + default=DATA_HOME + "/aidatatang_200zh", + type=str, + help="Directory to save the dataset. (default: %(default)s)") +parser.add_argument( + "--manifest_prefix", + default="manifest", + type=str, + help="Filepath prefix for output manifests. (default: %(default)s)") +args = parser.parse_args() + + +def create_manifest(data_dir, manifest_path_prefix): + print("Creating manifest %s ..." % manifest_path_prefix) + json_lines = [] + transcript_path = os.path.join(data_dir, 'transcript', + 'aidatatang_200_zh_transcript.txt') + transcript_dict = {} + for line in codecs.open(transcript_path, 'r', 'utf-8'): + line = line.strip() + if line == '': + continue + audio_id, text = line.split(' ', 1) + # remove withespace, charactor text + text = ''.join(text.split()) + transcript_dict[audio_id] = text + + data_types = ['train', 'dev', 'test'] + for dtype in data_types: + del json_lines[:] + total_sec = 0.0 + total_text = 0.0 + total_num = 0 + + audio_dir = os.path.join(data_dir, 'corpus/', dtype) + for subfolder, _, filelist in sorted(os.walk(audio_dir)): + for fname in filelist: + if not fname.endswith('.wav'): + continue + + audio_path = os.path.abspath(os.path.join(subfolder, fname)) + audio_id = os.path.basename(fname)[:-4] + utt2spk = Path(audio_path).parent.name + + audio_data, samplerate = soundfile.read(audio_path) + duration = float(len(audio_data) / samplerate) + text = transcript_dict[audio_id] + json_lines.append( + json.dumps( + { + 'utt': audio_id, + 'utt2spk': str(utt2spk), + 'feat': audio_path, + 'feat_shape': (duration, ), # second + 'text': text, + }, + ensure_ascii=False)) + + total_sec += duration + total_text += len(text) + total_num += 1 + + manifest_path = manifest_path_prefix + '.' + dtype + with codecs.open(manifest_path, 'w', 'utf-8') as fout: + for line in json_lines: + fout.write(line + '\n') + + manifest_dir = os.path.dirname(manifest_path_prefix) + meta_path = os.path.join(manifest_dir, dtype) + '.meta' + with open(meta_path, 'w') as f: + print(f"{dtype}:", file=f) + print(f"{total_num} utts", file=f) + print(f"{total_sec / (60*60)} h", file=f) + print(f"{total_text} text", file=f) + print(f"{total_text / total_sec} text/sec", file=f) + print(f"{total_sec / total_num} sec/utt", file=f) + + +def prepare_dataset(url, md5sum, target_dir, manifest_path, subset): + """Download, unpack and create manifest file.""" + data_dir = os.path.join(target_dir, subset) + if not os.path.exists(data_dir): + filepath = download(url, md5sum, target_dir) + unpack(filepath, target_dir) + # unpack all audio tar files + audio_dir = os.path.join(data_dir, 'corpus') + for subfolder, dirlist, filelist in sorted(os.walk(audio_dir)): + for sub in dirlist: + print(f"unpack dir {sub}...") + for folder, _, filelist in sorted( + os.walk(os.path.join(subfolder, sub))): + for ftar in filelist: + unpack(os.path.join(folder, ftar), folder, True) + else: + print("Skip downloading and unpacking. Data already exists in %s." 
% + target_dir) + + create_manifest(data_dir, manifest_path) + + +def main(): + print(f"args: {args}") + if args.target_dir.startswith('~'): + args.target_dir = os.path.expanduser(args.target_dir) + + prepare_dataset( + url=DATA_URL, + md5sum=MD5_DATA, + target_dir=args.target_dir, + manifest_path=args.manifest_prefix, + subset='aidatatang_200zh') + + print("Data download and manifest prepare done!") + + +if __name__ == '__main__': + main() diff --git a/paddlespeech/dataset/aishell/README.md b/paddlespeech/dataset/aishell/README.md new file mode 100644 index 00000000..c46312df --- /dev/null +++ b/paddlespeech/dataset/aishell/README.md @@ -0,0 +1,58 @@ +# [Aishell1](http://openslr.elda.org/33/) + +This Open Source Mandarin Speech Corpus, AISHELL-ASR0009-OS1, is 178 hours long. It is a part of AISHELL-ASR0009, of which utterance contains 11 domains, including smart home, autonomous driving, and industrial production. The whole recording was put in quiet indoor environment, using 3 different devices at the same time: high fidelity microphone (44.1kHz, 16-bit,); Android-system mobile phone (16kHz, 16-bit), iOS-system mobile phone (16kHz, 16-bit). Audios in high fidelity were re-sampled to 16kHz to build AISHELL- ASR0009-OS1. 400 speakers from different accent areas in China were invited to participate in the recording. The manual transcription accuracy rate is above 95%, through professional speech annotation and strict quality inspection. The corpus is divided into training, development and testing sets. ( This database is free for academic research, not in the commerce, if without permission. ) + + +## Dataset Architecture + +```bash +data_aishell +├── transcript # text 目录 +└── wav # wav 目录 + ├── dev # dev 目录 + │ ├── S0724 # spk 目录 + │ ├── S0725 + │ ├── S0726 + ├── train + │ ├── S0724 + │ ├── S0725 + │ ├── S0726 + ├── test + │ ├── S0724 + │ ├── S0725 + │ ├── S0726 + + +data_aishell +├── transcript +│ └── aishell_transcript_v0.8.txt # 文本标注文件 +└── wav + ├── dev + │ ├── S0724 + │ │ ├── BAC009S0724W0121.wav # S0724 的音频 + │ │ ├── BAC009S0724W0122.wav + │ │ ├── BAC009S0724W0123.wav + ├── test + │ ├── S0724 + │ │ ├── BAC009S0724W0121.wav + │ │ ├── BAC009S0724W0122.wav + │ │ ├── BAC009S0724W0123.wav + ├── train + │ ├── S0724 + │ │ ├── BAC009S0724W0121.wav + │ │ ├── BAC009S0724W0122.wav + │ │ ├── BAC009S0724W0123.wav + +标注文件格式: +> head data_aishell/transcript/aishell_transcript_v0.8.txt +BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购 +BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉 +BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后 +BAC009S0002W0125 各地 政府 便 纷纷 跟进 +BAC009S0002W0126 仅 一 个 多 月 的 时间 里 +BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外 +BAC009S0002W0128 四十六 个 限 购 城市 当中 +BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购 +BAC009S0002W0130 财政 金融 政策 紧随 其后 而来 +BAC009S0002W0131 显示 出 了 极 强 的 威力 +``` diff --git a/paddlespeech/dataset/aishell/__init__.py b/paddlespeech/dataset/aishell/__init__.py new file mode 100644 index 00000000..667680af --- /dev/null +++ b/paddlespeech/dataset/aishell/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
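+
+## Usage
+
+A sketch of preparing the manifests with this module (the manifest prefix is
+illustrative):
+
+```bash
+python paddlespeech/dataset/aishell/aishell.py \
+    --target_dir=~/.cache/paddle/dataset/speech/Aishell \
+    --manifest_prefix=data/manifest
+```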
diff --git a/paddlespeech/dataset/aishell/__init__.py b/paddlespeech/dataset/aishell/__init__.py
new file mode 100644
index 00000000..667680af
--- /dev/null
+++ b/paddlespeech/dataset/aishell/__init__.py
@@ -0,0 +1,18 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .aishell import check_dataset
+from .aishell import create_manifest
+from .aishell import download_dataset
+from .aishell import main as aishell_main
+from .aishell import prepare_dataset
diff --git a/paddlespeech/dataset/aishell/aishell.py b/paddlespeech/dataset/aishell/aishell.py
new file mode 100644
index 00000000..fa90aa67
--- /dev/null
+++ b/paddlespeech/dataset/aishell/aishell.py
@@ -0,0 +1,229 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Prepare Aishell mandarin dataset
+
+Download, unpack and create manifest files.
+Manifest file is a json-format file with each line containing the
+meta data (i.e. audio filepath, transcript and audio duration)
+of each audio file in the data set.
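+
+A single manifest line is a JSON object like the following (values are
+illustrative; ``utt2spk`` is the parent directory name of the wav file, and
+``feat_shape`` holds the duration in seconds):
+
+    {"utt": "BAC009S0002W0122", "utt2spk": "S0002",
+     "feat": "/path/to/data_aishell/wav/train/S0002/BAC009S0002W0122.wav",
+     "feat_shape": [4.2], "text": "而对楼市成交抑制作用最大的限购"}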
+"""
+import argparse
+import codecs
+import json
+import os
+from pathlib import Path
+
+import soundfile
+
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unpack
+
+DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
+
+URL_ROOT = 'http://openslr.elda.org/resources/33'
+# URL_ROOT = 'https://openslr.magicdatatech.com/resources/33'
+DATA_URL = URL_ROOT + '/data_aishell.tgz'
+MD5_DATA = '2f494334227864a8a8fec932999db9d8'
+RESOURCE_URL = URL_ROOT + '/resource_aishell.tgz'
+MD5_RESOURCE = '957d480a0fcac85fc18e550756f624e5'
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    "--target_dir",
+    default=DATA_HOME + "/Aishell",
+    type=str,
+    help="Directory to save the dataset. (default: %(default)s)")
+parser.add_argument(
+    "--manifest_prefix",
+    default="manifest",
+    type=str,
+    help="Filepath prefix for output manifests. (default: %(default)s)")
+args = parser.parse_args()
+
+
+def create_manifest(data_dir, manifest_path_prefix):
+    print("Creating manifest %s ..." % os.path.join(data_dir,
+                                                    manifest_path_prefix))
+    json_lines = []
+    transcript_path = os.path.join(data_dir, 'transcript',
+                                   'aishell_transcript_v0.8.txt')
+    transcript_dict = {}
+    for line in codecs.open(transcript_path, 'r', 'utf-8'):
+        line = line.strip()
+        if line == '':
+            continue
+        audio_id, text = line.split(' ', 1)
+        # remove whitespace; keep character-level text
+        text = ''.join(text.split())
+        transcript_dict[audio_id] = text
+
+    data_metas = dict()
+    data_types = ['train', 'dev', 'test']
+    for dtype in data_types:
+        del json_lines[:]
+        total_sec = 0.0
+        total_text = 0.0
+        total_num = 0
+
+        audio_dir = os.path.join(data_dir, 'wav', dtype)
+        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
+            for fname in filelist:
+                audio_path = os.path.abspath(os.path.join(subfolder, fname))
+                audio_id = os.path.basename(fname)[:-4]
+                # skip audio that has no transcription
+                if audio_id not in transcript_dict:
+                    continue
+
+                utt2spk = Path(audio_path).parent.name
+                audio_data, samplerate = soundfile.read(audio_path)
+                duration = float(len(audio_data) / samplerate)
+                text = transcript_dict[audio_id]
+                json_lines.append(
+                    json.dumps(
+                        {
+                            'utt': audio_id,
+                            'utt2spk': str(utt2spk),
+                            'feat': audio_path,
+                            'feat_shape': (duration, ),  # second
+                            'text': text
+                        },
+                        ensure_ascii=False))
+
+                total_sec += duration
+                total_text += len(text)
+                total_num += 1
+
+        manifest_path = manifest_path_prefix + '.' + dtype
+        with codecs.open(manifest_path, 'w', 'utf-8') as fout:
+            for line in json_lines:
+                fout.write(line + '\n')
+
+        meta = dict()
+        meta["dtype"] = dtype  # train, dev, test
+        meta["utts"] = total_num
+        meta["hours"] = total_sec / (60 * 60)
+        meta["text"] = total_text
+        meta["text/sec"] = total_text / total_sec
+        meta["sec/utt"] = total_sec / total_num
+        data_metas[dtype] = meta
+
+        manifest_dir = os.path.dirname(manifest_path_prefix)
+        meta_path = os.path.join(manifest_dir, dtype) + '.meta'
+        with open(meta_path, 'w') as f:
+            for key, val in meta.items():
+                print(f"{key}: {val}", file=f)
+
+    return data_metas
+
+
+def download_dataset(url, md5sum, target_dir):
+    """Download and unpack the dataset if it is not already present."""
+    data_dir = os.path.join(target_dir, 'data_aishell')
+    if not os.path.exists(data_dir):
+        filepath = download(url, md5sum, target_dir)
+        unpack(filepath, target_dir)
+        # unpack all audio tar files
+        audio_dir = os.path.join(data_dir, 'wav')
+        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
+            for ftar in filelist:
+                unpack(os.path.join(subfolder, ftar), subfolder, True)
+    else:
+        print("Skip downloading and unpacking. Data already exists in %s." %
+              os.path.abspath(target_dir))
+    return os.path.abspath(data_dir)
+
+
+def check_dataset(data_dir):
+    print(f"check dataset {os.path.abspath(data_dir)} ...")
+
+    transcript_path = os.path.join(data_dir, 'transcript',
+                                   'aishell_transcript_v0.8.txt')
+    if not os.path.exists(transcript_path):
+        raise FileNotFoundError(f"no transcript file found in {data_dir}.")
+
+    transcript_dict = {}
+    for line in codecs.open(transcript_path, 'r', 'utf-8'):
+        line = line.strip()
+        if line == '':
+            continue
+        audio_id, text = line.split(' ', 1)
+        # remove whitespace; keep character-level text
+        text = ''.join(text.split())
+        transcript_dict[audio_id] = text
+
+    no_label = 0
+    data_types = ['train', 'dev', 'test']
+    for dtype in data_types:
+        audio_dir = os.path.join(data_dir, 'wav', dtype)
+        if not os.path.exists(audio_dir):
+            raise IOError(f"{audio_dir} does not exist.")
+
+        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
+            for fname in filelist:
+                audio_path = os.path.abspath(os.path.join(subfolder, fname))
+                audio_id = os.path.basename(fname)[:-4]
+                # skip audio that has no transcription
+                if audio_id not in transcript_dict:
+                    print(f"Warning: {audio_id} has no transcript.")
+                    no_label += 1
+                    continue
+
+                utt2spk = Path(audio_path).parent.name
+                audio_data, samplerate = soundfile.read(audio_path)
+                assert samplerate == 16000, f"{audio_path} sample rate is {samplerate}, not 16k, please check."
+
+        print(f"Warning: {dtype} has {no_label} audio files without transcripts.")
+
+
+def prepare_dataset(url, md5sum, target_dir, manifest_path=None, check=False):
+    """Download, unpack and create manifest file."""
+    data_dir = download_dataset(url, md5sum, target_dir)
+
+    if check:
+        try:
+            check_dataset(data_dir)
+        except Exception as e:
+            raise ValueError(
+                f"{data_dir} dataset format not right, please check it."
+            ) from e
+
+    meta = None
+    if manifest_path:
+        meta = create_manifest(data_dir, manifest_path)
+
+    return data_dir, meta
+
+
+def main():
+    print(f"args: {args}")
+    if args.target_dir.startswith('~'):
+        args.target_dir = os.path.expanduser(args.target_dir)
+
+    data_dir, meta = prepare_dataset(
+        url=DATA_URL,
+        md5sum=MD5_DATA,
+        target_dir=args.target_dir,
+        manifest_path=args.manifest_prefix,
+        check=True)
+
+    resource_dir, _ = prepare_dataset(
+        url=RESOURCE_URL,
+        md5sum=MD5_RESOURCE,
+        target_dir=args.target_dir,
+        manifest_path=None)
+
+    print("Data download and manifest prepare done!")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/utils/utility.py b/paddlespeech/dataset/download.py
similarity index 59%
rename from utils/utility.py
rename to paddlespeech/dataset/download.py
index dbf8b1d7..28dbd0eb 100755
--- a/utils/utility.py
+++ b/paddlespeech/dataset/download.py
@@ -19,91 +19,16 @@ import zipfile
 from typing import Text
 
 __all__ = [
-    "check_md5sum", "getfile_insensitive", "download_multi", "download",
-    "unpack", "unzip", "md5file", "print_arguments", "add_arguments",
-    "get_commandline_args"
+    "check_md5sum",
+    "getfile_insensitive",
+    "download_multi",
+    "download",
+    "unpack",
+    "unzip",
+    "md5file",
 ]
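+
+# Illustrative usage of the helpers kept in this module (signatures as used
+# by the dataset scripts in this patch):
+#
+#     from paddlespeech.dataset.download import download, unpack
+#     filepath = download(url, md5sum, target_dir)
+#     unpack(filepath, target_dir)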
"""Print argparse's arguments. - - Usage: - - .. code-block:: python - - parser = argparse.ArgumentParser() - parser.add_argument("name", default="Jonh", type=str, help="User name.") - args = parser.parse_args() - print_arguments(args) - - :param args: Input argparse.Namespace for printing. - :type args: argparse.Namespace - """ - filename = "" - if info: - filename = info["__file__"] - filename = os.path.basename(filename) - print(f"----------- {filename} Configuration Arguments -----------") - for arg, value in sorted(vars(args).items()): - print("%s: %s" % (arg, value)) - print("-----------------------------------------------------------") - - -def add_arguments(argname, type, default, help, argparser, **kwargs): - """Add argparse's argument. - - Usage: - - .. code-block:: python - - parser = argparse.ArgumentParser() - add_argument("name", str, "Jonh", "User name.", parser) - args = parser.parse_args() - """ - type = distutils.util.strtobool if type == bool else type - argparser.add_argument( - "--" + argname, - default=default, - type=type, - help=help + ' Default: %(default)s.', - **kwargs) - - def md5file(fname): hash_md5 = hashlib.md5() f = open(fname, "rb") diff --git a/paddlespeech/utils/argparse.py b/paddlespeech/utils/argparse.py new file mode 100644 index 00000000..4df75c5a --- /dev/null +++ b/paddlespeech/utils/argparse.py @@ -0,0 +1,98 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import hashlib +import os +import sys +from typing import Text + +__all__ = ["print_arguments", "add_arguments", "get_commandline_args"] + + +def get_commandline_args(): + extra_chars = [ + " ", + ";", + "&", + "(", + ")", + "|", + "^", + "<", + ">", + "?", + "*", + "[", + "]", + "$", + "`", + '"', + "\\", + "!", + "{", + "}", + ] + + # Escape the extra characters for shell + argv = [ + arg.replace("'", "'\\''") if all(char not in arg + for char in extra_chars) else + "'" + arg.replace("'", "'\\''") + "'" for arg in sys.argv + ] + + return sys.executable + " " + " ".join(argv) + + +def print_arguments(args, info=None): + """Print argparse's arguments. + + Usage: + + .. code-block:: python + + parser = argparse.ArgumentParser() + parser.add_argument("name", default="Jonh", type=str, help="User name.") + args = parser.parse_args() + print_arguments(args) + + :param args: Input argparse.Namespace for printing. + :type args: argparse.Namespace + """ + filename = "" + if info: + filename = info["__file__"] + filename = os.path.basename(filename) + print(f"----------- {filename} Configuration Arguments -----------") + for arg, value in sorted(vars(args).items()): + print("%s: %s" % (arg, value)) + print("-----------------------------------------------------------") + + +def add_arguments(argname, type, default, help, argparser, **kwargs): + """Add argparse's argument. + + Usage: + + .. 
+    extra_chars = [
+        " ",
+        ";",
+        "&",
+        "(",
+        ")",
+        "|",
+        "^",
+        "<",
+        ">",
+        "?",
+        "*",
+        "[",
+        "]",
+        "$",
+        "`",
+        '"',
+        "\\",
+        "!",
+        "{",
+        "}",
+    ]
+
+    # Escape the extra characters for shell
+    argv = [
+        arg.replace("'", "'\\''") if all(char not in arg
+                                         for char in extra_chars) else
+        "'" + arg.replace("'", "'\\''") + "'" for arg in sys.argv
+    ]
+
+    return sys.executable + " " + " ".join(argv)
+
+
+def print_arguments(args, info=None):
+    """Print argparse's arguments.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        parser.add_argument("name", default="John", type=str, help="User name.")
+        args = parser.parse_args()
+        print_arguments(args)
+
+    :param args: Input argparse.Namespace for printing.
+    :type args: argparse.Namespace
+    """
+    filename = ""
+    if info:
+        filename = info["__file__"]
+    filename = os.path.basename(filename)
+    print(f"----------- {filename} Configuration Arguments -----------")
+    for arg, value in sorted(vars(args).items()):
+        print("%s: %s" % (arg, value))
+    print("-----------------------------------------------------------")
+
+
+def add_arguments(argname, type, default, help, argparser, **kwargs):
+    """Add argparse's argument.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        add_arguments("name", str, "John", "User name.", parser)
+        args = parser.parse_args()
+    """
+    type = distutils.util.strtobool if type == bool else type
+    argparser.add_argument(
+        "--" + argname,
+        default=default,
+        type=type,
+        help=help + ' Default: %(default)s.',
+        **kwargs)
diff --git a/tests/test_tipc/conformer/scripts/aishell_tiny.py b/tests/test_tipc/conformer/scripts/aishell_tiny.py
index 14f09f17..c87463b5 100644
--- a/tests/test_tipc/conformer/scripts/aishell_tiny.py
+++ b/tests/test_tipc/conformer/scripts/aishell_tiny.py
@@ -26,8 +26,8 @@ from pathlib import Path
 
 import soundfile
 
-from utils.utility import download
-from utils.utility import unpack
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unpack
 
 DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
diff --git a/tests/unit/cli/aishell_test_prepare.py b/tests/unit/cli/aishell_test_prepare.py
index ed542c57..c364e4fd 100644
--- a/tests/unit/cli/aishell_test_prepare.py
+++ b/tests/unit/cli/aishell_test_prepare.py
@@ -25,8 +25,8 @@ from pathlib import Path
 
 import soundfile
 
-from utils.utility import download
-from utils.utility import unpack
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unpack
 
 DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
diff --git a/utils/manifest_key_value.py b/utils/manifest_key_value.py
index 3825fb9b..5ffe8e55 100755
--- a/utils/manifest_key_value.py
+++ b/utils/manifest_key_value.py
@@ -6,8 +6,8 @@ from pathlib import Path
 
 import jsonlines
 
-from utils.utility import add_arguments
-from utils.utility import print_arguments
+from paddlespeech.utils.argparse import add_arguments
+from paddlespeech.utils.argparse import print_arguments
 
 
 def main(args):