"""This tool is used for splitting data into each node of
paddlecloud. This script should be called in paddlecloud.
"""
# NOTE: the module docstring above is also shown by ``--help`` (it is passed
# to argparse as ``description=__doc__``), so its wording is runtime text.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import json
import argparse

# The parser is built at import time, but arguments are only parsed when the
# file is executed as a script, so importing this module has no side effects
# on the command line.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--in_manifest_path",
    type=str,
    required=True,
    help="Input manifest path for all nodes.")
parser.add_argument(
    "--out_manifest_path",
    type=str,
    required=True,
    help="Output manifest file path for current node.")


def _read_int_file(path):
    """Return the integer written on the first line of the file at `path`.

    Surrounding whitespace (including an optional trailing newline) is
    ignored, so the value is read correctly whether or not the file ends
    with a newline.

    :raises ValueError: if the first line is not a valid integer.
    """
    with open(path, "r") as f:
        return int(f.readline().strip())


def split_data(in_manifest_path,
               out_manifest_path,
               trainer_id_path="/trainer_id",
               trainer_count_path="/trainer_count"):
    """Write this node's share of the input manifest to the output manifest.

    Line `i` (0-based) of the input manifest is kept when
    `i % trainer_count == trainer_id`. Every kept line is stripped of
    leading/trailing whitespace and written followed by a single newline.

    :param in_manifest_path: Path of the manifest shared by all nodes.
    :param out_manifest_path: Path of the manifest to write for this node.
    :param trainer_id_path: File whose first line holds this node's index.
    :param trainer_count_path: File whose first line holds the node count.
    :raises ValueError: if either trainer file does not contain an integer.
    :raises ZeroDivisionError: if the trainer count is 0 and the input
        manifest is not empty.
    """
    trainer_id = _read_int_file(trainer_id_path)
    trainer_count = _read_int_file(trainer_count_path)

    # Use a context manager so the input file is closed deterministically.
    with open(in_manifest_path, 'r') as in_file:
        out_manifest = [
            "%s\n" % json_line.strip()
            for index, json_line in enumerate(in_file)
            if (index % trainer_count) == trainer_id
        ]

    with open(out_manifest_path, 'w') as out_file:
        out_file.writelines(out_manifest)


if __name__ == '__main__':
    args = parser.parse_args()
    split_data(args.in_manifest_path, args.out_manifest_path)