"""Insert a space between every character of each line in a text file.

If there are no spaces between sentences in your text file, use this script
to generate a new file which adds spaces between each token (each character
of a line is treated as one token).

Originally added as examples/iwslt2012/punc0/local/preprocess.py.

Usage:
    python preprocess.py -input_file raw.txt -output_file spaced.txt
"""
import argparse
import os


def process_sentence(line):
    """Return *line* with a single space inserted between adjacent characters.

    A trailing line terminator (``\\n`` or ``\\r\\n``) is not treated as a
    token: it is split off first and re-appended unchanged, so the result
    never contains a stray space right before the newline.

    Args:
        line: One line of text, with or without its trailing newline.

    Returns:
        The space-separated line. An empty string yields an empty string and
        a bare newline yields a bare newline.
    """
    content = line.rstrip('\r\n')
    terminator = line[len(content):]
    # str.join builds the result in one pass instead of repeated `+=`.
    return ' '.join(content) + terminator


def main(argv=None):
    """Read the input file line by line and write the spaced-out version.

    Args:
        argv: Optional list of command-line arguments; when ``None`` argparse
            falls back to ``sys.argv[1:]``.

    Returns:
        The number of lines processed (blank lines are counted too).
    """
    parser = argparse.ArgumentParser(description="Input filename")
    # The single-dash spellings are the original interface; the double-dash
    # forms are conventional aliases. Both are required so that a missing
    # argument produces a usage error instead of a TypeError from open(None).
    parser.add_argument(
        '-input_file', '--input_file', dest='input_file', required=True)
    parser.add_argument(
        '-output_file', '--output_file', dest='output_file', required=True)
    args = parser.parse_args(argv)

    sentence_cnt = 0
    # Explicit UTF-8 keeps the result independent of the platform locale.
    with open(args.input_file, 'r', encoding='utf-8') as f, \
            open(args.output_file, 'w', encoding='utf-8') as write_f:
        for line in f:
            sentence_cnt += 1
            write_f.write(process_sentence(line))

    print('preprocess over')
    print('total sentences number:', sentence_cnt)
    return sentence_cnt


if __name__ == "__main__":
    main()