From e106f243b4f765fad466cc0608ba5b1240e2050c Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 29 Jun 2021 12:13:04 +0000 Subject: [PATCH] dump dataset metadata --- examples/dataset/thchs30/thchs30.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/examples/dataset/thchs30/thchs30.py b/examples/dataset/thchs30/thchs30.py index 5613d768..c28fa56f 100644 --- a/examples/dataset/thchs30/thchs30.py +++ b/examples/dataset/thchs30/thchs30.py @@ -96,6 +96,10 @@ def create_manifest(data_dir, manifest_path_prefix): data_types = ['train', 'dev', 'test'] for dtype in data_types: del json_lines[:] + total_sec = 0.0 + total_text = 0.0 + total_num = 0 + audio_dir = os.path.join(data_dir, dtype) for subfolder, _, filelist in sorted(os.walk(audio_dir)): for fname in filelist: @@ -125,11 +129,23 @@ def create_manifest(data_dir, manifest_path_prefix): }, ensure_ascii=False)) + total_sec += duration + total_text += len(text) + total_num += 1 + manifest_path = manifest_path_prefix + '.' + dtype with codecs.open(manifest_path, 'w', 'utf-8') as fout: for line in json_lines: fout.write(line + '\n') + with open(dtype + '.meta', 'w') as f: + print(f"{dtype}:", file=f) + print(f"{total_num} utts", file=f) + print(f"{total_sec / (60*60)} h", file=f) + print(f"{total_text} text", file=f) + print(f"{total_text / total_sec} text/sec", file=f) + print(f"{total_sec / total_num} sec/utt", file=f) + def prepare_dataset(url, md5sum, target_dir, manifest_path, subset): """Download, unpack and create manifest file."""