# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
|
|
|
|
|
import os
|
|
|
|
|
import shutil
|
|
|
|
|
|
|
|
|
|
# Lookup table from a punctuation label to the text written after a word.
CHINESE_PUNCTUATION_MAPPING = {
    "O": "",  # label 'O' maps to the empty string (nothing is written)
    ",": ",",  # full-width comma label -> ASCII comma
    "。": "。",  # ideographic full stop is written unchanged
    "?": "?",  # full-width question mark label -> ASCII question mark
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def process_one_file_chinese(raw_path, save_path):
    """Rewrite a Chinese text file with one space after every character.

    Every input line produces exactly one output line: the line is stripped,
    all whitespace inside it is removed, and each remaining character is
    written followed by a single space. A blank input line produces an empty
    output line.

    Args:
        raw_path: Path of the UTF-8 text file to read.
        save_path: Path of the UTF-8 text file to write; it is overwritten
            if it already exists.
    """
    # ``with`` closes both handles even when reading or writing fails.
    with open(raw_path, 'r', encoding='utf-8') as raw_file, \
            open(save_path, 'w', encoding='utf-8') as save_file:
        for line in raw_file:
            # Drop every whitespace character (ASCII or full-width spaces,
            # tabs, the newline): the output is space-delimited, so a
            # whitespace "character" must never be emitted as a token.
            characters = ''.join(line.split())
            save_file.write(''.join(ch + ' ' for ch in characters) + '\n')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def process_chinese_pure_senetence(config):
    """Prepare the train/dev/test splits of a plain-sentence Chinese corpus.

    Checks that the three raw split files and the punctuation file exist,
    creates ``config["save_path"]`` when it is missing, copies the
    punctuation file into it, and converts each raw split with
    ``process_one_file_chinese`` into ``train``, ``dev`` and ``test``.

    Args:
        config: Mapping providing ``raw_path``, ``raw_train_file``,
            ``raw_dev_file``, ``raw_test_file``, ``punc_file`` and
            ``save_path``.

    Raises:
        AssertionError: If one of the four input files does not exist.
    """
    # (config key, assertion message) for every file that must be present.
    required_inputs = (
        ("raw_train_file", "train file doesn't exist."),
        ("raw_dev_file", "dev file doesn't exist."),
        ("raw_test_file", "test file doesn't exist."),
        ("punc_file", "punc file doesn't exist."),
    )
    for file_key, failure_message in required_inputs:
        assert os.path.exists(
            os.path.join(config["raw_path"], config[file_key])), failure_message

    split_keys = ("raw_train_file", "raw_dev_file", "raw_test_file")
    raw_splits = [
        os.path.join(config["raw_path"], config[split_key])
        for split_key in split_keys
    ]

    if not os.path.exists(config["save_path"]):
        os.makedirs(config["save_path"])

    punc_source = os.path.join(config["raw_path"], config["punc_file"])
    punc_target = os.path.join(config["save_path"], config["punc_file"])
    shutil.copy(punc_source, punc_target)

    for split_name, raw_split in zip(("train", "dev", "test"), raw_splits):
        process_one_file_chinese(
            raw_split, os.path.join(config["save_path"], split_name))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def process_one_chinese_pair(raw_path, save_path):
    """Convert a file of "word label" lines into running text.

    Lines that do not consist of exactly two whitespace-separated fields are
    skipped. For every other line the word is written, followed by a space
    and the text that ``CHINESE_PUNCTUATION_MAPPING`` holds for the label.
    A "。" label ends the current output line; any other label is followed
    by a single space.

    Args:
        raw_path: Path of the UTF-8 input file, one "word label" pair per line.
        save_path: Path of the UTF-8 output file; overwritten if it exists.

    Raises:
        KeyError: If a label is not a key of ``CHINESE_PUNCTUATION_MAPPING``.
    """
    # ``with`` guarantees both files are closed, including when an unknown
    # label raises KeyError half-way through the input.
    with open(raw_path, 'r', encoding='utf-8') as raw_file, \
            open(save_path, 'w', encoding='utf-8') as save_file:
        for line in raw_file:
            fields = line.split()
            if len(fields) != 2:
                continue
            word, punc = fields
            # A full stop closes the sentence; everything else continues it.
            terminator = "\n" if punc == "。" else " "
            save_file.write(
                word + ' ' + CHINESE_PUNCTUATION_MAPPING[punc] + terminator)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def process_chinese_pair(config):
    """Prepare the train/dev/test splits of a "word label" Chinese corpus.

    Checks that the three raw split files and the punctuation file exist,
    makes sure ``config["save_path"]`` exists, converts each raw split with
    ``process_one_chinese_pair`` into ``train``, ``dev`` and ``test``, and
    finally copies the punctuation file into the output directory.

    Args:
        config: Mapping providing ``raw_path``, ``raw_train_file``,
            ``raw_dev_file``, ``raw_test_file``, ``punc_file`` and
            ``save_path``.

    Raises:
        AssertionError: If one of the four input files does not exist.
    """
    assert os.path.exists(
        os.path.join(config["raw_path"], config[
            "raw_train_file"])), "train file doesn't exist."
    assert os.path.exists(
        os.path.join(config["raw_path"], config[
            "raw_dev_file"])), "dev file doesn't exist."
    assert os.path.exists(
        os.path.join(config["raw_path"], config[
            "raw_test_file"])), "test file doesn't exist."
    assert os.path.exists(
        os.path.join(config["raw_path"], config[
            "punc_file"])), "punc file doesn't exist."

    train_file = os.path.join(config["raw_path"], config["raw_train_file"])
    dev_file = os.path.join(config["raw_path"], config["raw_dev_file"])
    test_file = os.path.join(config["raw_path"], config["raw_test_file"])

    # The output directory must exist before any split is written; without
    # this the first open() for writing fails when save_path is missing.
    os.makedirs(config["save_path"], exist_ok=True)

    process_one_chinese_pair(train_file,
                             os.path.join(config["save_path"], "train"))
    process_one_chinese_pair(dev_file,
                             os.path.join(config["save_path"], "dev"))
    process_one_chinese_pair(test_file,
                             os.path.join(config["save_path"], "test"))

    shutil.copy(
        os.path.join(config["raw_path"], config["punc_file"]),
        os.path.join(config["save_path"], config["punc_file"]))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Marks that process_one_file_english prefixes with a space.
english_punc = [",", ".", "?"]
# Characters that process_one_file_english deletes from every line.
ignore_english_punc = ['"', "/"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def process_one_file_english(raw_path, save_path):
    """Rewrite an English text file with punctuation split off from words.

    For every input line, each character listed in ``ignore_english_punc``
    is deleted and each mark listed in ``english_punc`` gets a space inserted
    before it. The stripped line is then split on single spaces and every
    piece is written followed by a space; each input line yields exactly one
    output line.

    Args:
        raw_path: Path of the UTF-8 text file to read.
        save_path: Path of the UTF-8 text file to write; it is overwritten
            if it already exists.
    """
    # ``with`` closes both handles even when reading or writing fails.
    with open(raw_path, 'r', encoding='utf-8') as raw_file, \
            open(save_path, 'w', encoding='utf-8') as save_file:
        for line in raw_file:
            for mark in ignore_english_punc:
                line = line.replace(mark, '')
            for mark in english_punc:
                line = line.replace(mark, ' ' + mark)
            # split(' ') rather than split(): consecutive spaces yield empty
            # pieces, which keeps the spacing of the output unchanged.
            pieces = line.strip().split(' ')
            save_file.write(''.join(piece + ' ' for piece in pieces) + '\n')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def process_english_pure_senetence(config):
    """Prepare the train/dev/test splits of a plain-sentence English corpus.

    Checks that the three raw split files and the punctuation file exist,
    creates ``config["save_path"]`` when it is missing, copies the
    punctuation file into it, and converts each raw split with
    ``process_one_file_english`` into ``train``, ``dev`` and ``test``.

    Args:
        config: Mapping providing ``raw_path``, ``raw_train_file``,
            ``raw_dev_file``, ``raw_test_file``, ``punc_file`` and
            ``save_path``.

    Raises:
        AssertionError: If one of the four input files does not exist.
    """
    # (config key, assertion message) for every file that must be present.
    required_inputs = (
        ("raw_train_file", "train file doesn't exist."),
        ("raw_dev_file", "dev file doesn't exist."),
        ("raw_test_file", "test file doesn't exist."),
        ("punc_file", "punc file doesn't exist."),
    )
    for file_key, failure_message in required_inputs:
        assert os.path.exists(
            os.path.join(config["raw_path"], config[file_key])), failure_message

    split_keys = ("raw_train_file", "raw_dev_file", "raw_test_file")
    raw_splits = [
        os.path.join(config["raw_path"], config[split_key])
        for split_key in split_keys
    ]

    if not os.path.exists(config["save_path"]):
        os.makedirs(config["save_path"])

    punc_source = os.path.join(config["raw_path"], config["punc_file"])
    punc_target = os.path.join(config["save_path"], config["punc_file"])
    shutil.copy(punc_source, punc_target)

    for split_name, raw_split in zip(("train", "dev", "test"), raw_splits):
        process_one_file_english(
            raw_split, os.path.join(config["save_path"], split_name))
|