From 2aef6958de17ce0c121c92bc1d9d63ba2d79edff Mon Sep 17 00:00:00 2001 From: Zhao Yuting <91456992+THUzyt21@users.noreply.github.com> Date: Wed, 24 Aug 2022 11:23:27 +0800 Subject: [PATCH 1/3] Create preprocess.py If the characters (tokens) in your text file are not separated by spaces, use this script to generate a new file in which a space is inserted between every pair of adjacent characters. --- examples/iwslt2012/punc0/preprocess.py | 27 ++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 examples/iwslt2012/punc0/preprocess.py diff --git a/examples/iwslt2012/punc0/preprocess.py b/examples/iwslt2012/punc0/preprocess.py new file mode 100644 index 000000000..c6645bdb5 --- /dev/null +++ b/examples/iwslt2012/punc0/preprocess.py @@ -0,0 +1,27 @@ +import argparse +import os + +def process_sentence(line): + if line == '': return '' + res = line[0] + for i in range(1, len(line)): + res += (' ' + line[i]) + return res + +if __name__ == "__main__": + paser = argparse.ArgumentParser(description = "Input filename") + paser.add_argument('-input_file') + paser.add_argument('-output_file') + sentence_cnt = 0 + args = paser.parse_args() + with open(args.input_file, 'r') as f: + with open(args.output_file, 'w') as write_f: + while True: + line = f.readline() + if line: + sentence_cnt += 1 + write_f.write(process_sentence(line)) + else: + break + print('preprocess over') + print('total sentences number:', sentence_cnt) From d2f7362aa718cab9378f18b31f0fcb5e924a232a Mon Sep 17 00:00:00 2001 From: Zhao Yuting <91456992+THUzyt21@users.noreply.github.com> Date: Wed, 24 Aug 2022 14:40:39 +0800 Subject: [PATCH 2/3] Delete preprocess.py --- examples/iwslt2012/punc0/preprocess.py | 27 -------------------------- 1 file changed, 27 deletions(-) delete mode 100644 examples/iwslt2012/punc0/preprocess.py diff --git a/examples/iwslt2012/punc0/preprocess.py b/examples/iwslt2012/punc0/preprocess.py deleted file mode 100644 index c6645bdb5..000000000 --- a/examples/iwslt2012/punc0/preprocess.py +++ /dev/null @@ -1,27 
+0,0 @@ -import argparse -import os - -def process_sentence(line): - if line == '': return '' - res = line[0] - for i in range(1, len(line)): - res += (' ' + line[i]) - return res - -if __name__ == "__main__": - paser = argparse.ArgumentParser(description = "Input filename") - paser.add_argument('-input_file') - paser.add_argument('-output_file') - sentence_cnt = 0 - args = paser.parse_args() - with open(args.input_file, 'r') as f: - with open(args.output_file, 'w') as write_f: - while True: - line = f.readline() - if line: - sentence_cnt += 1 - write_f.write(process_sentence(line)) - else: - break - print('preprocess over') - print('total sentences number:', sentence_cnt) From 9473b8468c2c979f69017be4eff9df80edc16c31 Mon Sep 17 00:00:00 2001 From: Zhao Yuting <91456992+THUzyt21@users.noreply.github.com> Date: Wed, 24 Aug 2022 14:42:31 +0800 Subject: [PATCH 3/3] Create preprocess.py If the characters (tokens) in your text file are not separated by spaces, use this script to generate a new file in which a space is inserted between every pair of adjacent characters.
---
Reviewer note: the script below was revised during review, so the blob id on
the "index" line predates the revision and no longer matches the payload.

 examples/iwslt2012/punc0/local/preprocess.py | 41 ++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 examples/iwslt2012/punc0/local/preprocess.py

diff --git a/examples/iwslt2012/punc0/local/preprocess.py b/examples/iwslt2012/punc0/local/preprocess.py
new file mode 100644
index 000000000..03b27e89f
--- /dev/null
+++ b/examples/iwslt2012/punc0/local/preprocess.py
@@ -0,0 +1,41 @@
+"""Insert a space between every character of each line of a text file.
+
+Usage: python preprocess.py -input_file IN.txt -output_file OUT.txt
+"""
+import argparse
+import os
+
+
+def process_sentence(line):
+    """Return ``line`` with a space inserted between adjacent characters.
+
+    Trailing newline characters are kept as-is and are not treated as
+    tokens, so no stray space is written before the line terminator.
+    An empty string is returned unchanged.
+    """
+    text = line.rstrip('\n')
+    newline = line[len(text):]
+    return ' '.join(text) + newline
+
+
+def main():
+    """Parse the command line, convert the input file, print a summary."""
+    parser = argparse.ArgumentParser(description="Input filename")
+    # Single-dash option names are kept for command-line compatibility.
+    parser.add_argument('-input_file', required=True)
+    parser.add_argument('-output_file', required=True)
+    args = parser.parse_args()
+
+    sentence_cnt = 0
+    # Pin the encoding so non-ASCII text does not depend on the locale.
+    with open(args.input_file, 'r', encoding='utf-8') as f, \
+            open(args.output_file, 'w', encoding='utf-8') as write_f:
+        for line in f:
+            sentence_cnt += 1
+            write_f.write(process_sentence(line))
+    print('preprocess over')
+    print('total sentences number:', sentence_cnt)
+
+
+if __name__ == "__main__":
+    main()