"""Split a rhythm-labelled text file into train/test/dev files.

Every other line of the input (starting with the first) is taken as a
sample; the second tab-separated field of each sample is converted and
written as space-separated characters.
"""
import argparse
import os
import re

# Maps the "#N" markers found in the input text to the single punctuation
# character written in their place.
replace_ = {"#1": "%", "#2": "`", "#3": "~", "#4": "$"}


def replace_rhy_with_punc(line: str) -> str:
    """Return *line* with every ``#1``..``#4`` marker replaced.

    The replacement characters come from the module-level ``replace_`` table.
    """
    # NOTE(review): this pattern can never match -- it requires the literal
    # text "*%" *after* the end-of-string anchor -- so the call leaves `line`
    # unchanged. The original comment pointed at check_oov.py and a
    # punctuation character class, so it was presumably meant to strip some
    # characters. Kept as-is to preserve the produced data; confirm the
    # intended character set before changing it.
    line = re.sub(r'^$\*%', '', line)
    for marker, punc in replace_.items():
        line = line.replace(marker, punc)
    return line


def pre_and_write(data, file):
    """Convert each sample in *data* and write one line per sample to *file*.

    Args:
        data: iterable of raw lines; each must contain at least one tab, and
            the text between the first and second tab is the sample text.
        file: path of the output file (overwritten).

    Raises:
        IndexError: if a line contains no tab character.
    """
    # The samples are non-ASCII text, so do not depend on the locale's
    # default encoding.
    with open(file, 'w', encoding='utf-8') as out:
        for raw in data:
            text = raw.split('\t')[1].strip()
            text = replace_rhy_with_punc(text)
            # Joining a str puts a space between every character; the
            # trailing space before the newline is part of the output format.
            out.write(' '.join(text) + ' \n')


def main(argv=None):
    """Parse command-line options and write train.txt, test.txt and dev.txt.

    Args:
        argv: optional list of argument strings; ``None`` (the default)
            makes argparse read ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description="Train a Rhy prediction model.")
    parser.add_argument("--data", type=str, default="label_train-set.txt")
    parser.add_argument(
        "--processed_path", type=str, default="../data/rhy_predict")
    args = parser.parse_args(argv)
    print(args.data, args.processed_path)
    os.makedirs(args.processed_path, exist_ok=True)

    with open(args.data, encoding='utf-8') as src:
        lines = src.readlines()

    # Only every other line, starting with the first, is used as a sample.
    text = lines[0::2]
    total = len(text)

    train_size = int(total * 0.9)
    test_size = int(total * 0.05)
    # Truncating each share separately can leave a few samples unassigned;
    # the dev split takes whatever remains so that no sample is dropped.
    dev_size = total - train_size - test_size

    start = 0
    for size, name in zip((train_size, test_size, dev_size),
                          ('train.txt', 'test.txt', 'dev.txt')):
        path = os.path.join(args.processed_path, name)
        pre_and_write(text[start:start + size], path)
        start += size


if __name__ == "__main__":
    main()