PaddleSpeech/datasets/librispeech/pcloud_data.py

52 lines
1.5 KiB

import json
import os
import tarfile
import sys
import argparse
# Command-line options. Each entry is (flag, default value, help text); every
# option is parsed as a string.
parser = argparse.ArgumentParser(description=__doc__)
for _flag, _default, _help in (
        ("--manifest_path", "/manifest.train",
         "Manifest of target data. (default: %(default)s)"),
        ("--out_tar_path", "/dev.tar",
         "Output tar file path. (default: %(default)s)"),
        ("--out_manifest_path", "/dev.mani",
         "Manifest of output data. (default: %(default)s)"),
):
    parser.add_argument(_flag, default=_default, type=str, help=_help)
# Arguments are parsed at module level; the __main__ guard reads them.
args = parser.parse_args()
def gen_pcloud_data(manifest_path, out_tar_path, out_manifest_path):
    '''
    Pack the sound files listed in a manifest into a single tar archive.

    1. According to the manifest, tar sound files into out_tar_path.
    2. Generate a new manifest for the output tar file, in which each
       'audio_filepath' is replaced by the file's name inside the archive.

    :param manifest_path: Path of the input manifest. Every line must be a
                          JSON object containing an 'audio_filepath' entry.
    :param out_tar_path: Path of the (uncompressed) tar archive to create.
    :param out_manifest_path: Path of the manifest written for the archive.
    :raises IOError: If a manifest line cannot be parsed as JSON.
    :raises KeyError: If a manifest entry has no 'audio_filepath'.
    '''
    manifest = []
    # Context managers release both file handles even when a line fails to
    # parse or a listed sound file cannot be added.
    with tarfile.open(out_tar_path, 'w') as out_tar, \
            open(manifest_path) as in_manifest:
        for json_line in in_manifest:
            try:
                json_data = json.loads(json_line)
            except ValueError as e:
                # json reports malformed input as ValueError (or a subclass);
                # callers of this function expect IOError.
                raise IOError("Error reading manifest: %s" % str(e))
            sound_file = json_data['audio_filepath']
            # Members are stored flat under their base name.
            # NOTE: sound files that share a base name map to the same
            # archive member name.
            filename = os.path.basename(sound_file)
            out_tar.add(sound_file, arcname=filename)
            json_data['audio_filepath'] = filename
            manifest.append("%s\n" % json.dumps(json_data))
    # The new manifest is only written once every entry has been archived.
    with open(out_manifest_path, 'w') as out_manifest:
        out_manifest.writelines(manifest)
if __name__ == '__main__':
    # Script entry point: build the tar archive and its manifest from the
    # paths supplied on the command line.
    gen_pcloud_data(
        manifest_path=args.manifest_path,
        out_tar_path=args.out_tar_path,
        out_manifest_path=args.out_manifest_path)