# You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
# PaddleSpeech/paddlespeech/text/speechtask/punctuation_restoration/utils/punct_pre.py
#
# 164 lines
# 6.3 KiB

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import shutil
# Maps the label (second column) of a "<token> <label>" pair line to the text
# that process_one_chinese_pair writes right after the token. 'O' maps to the
# empty string, so nothing is appended for that label.
# NOTE(review): the three entries after 'O' all show an empty-string key and an
# empty-string value. Duplicate keys in a dict literal collapse to the last
# one, so as written this dict holds only the keys 'O' and ''. The literals
# look like they lost non-ASCII characters (presumably full-width Chinese
# punctuation marks) somewhere in transit -- confirm against the upstream file
# before relying on this table.
CHINESE_PUNCTUATION_MAPPING = {
    'O': '',
    '': "",
    '': '',
    '': '',
}
def process_one_file_chinese(raw_path, save_path):
    """Rewrite a text file so that every character is followed by a space.

    Each input line is stripped of surrounding whitespace, all ASCII spaces
    inside it are removed, and the remaining characters are written out one
    by one, each followed by a single space. Every input line produces
    exactly one output line (an empty input line produces an empty one).

    Args:
        raw_path: Path of the UTF-8 text file to read.
        save_path: Path of the UTF-8 text file to create or overwrite.
    """
    # Both handles are managed by ``with`` so they are closed even when a
    # read or write fails part-way through.
    with open(raw_path, 'r', encoding='utf-8') as raw_file, \
            open(save_path, 'w', encoding='utf-8') as save_file:
        # Iterate the file lazily instead of loading every line up front.
        for line in raw_file:
            # NOTE(review): the previous code chained two identical-looking
            # ``replace(' ', '')`` calls; if one of them originally targeted
            # a non-ASCII space, it needs to be restored here.
            chars = line.strip().replace(' ', '')
            save_file.write(''.join(char + ' ' for char in chars) + '\n')
def process_chinese_pure_senetence(config):
    """Build the train/dev/test files for plain Chinese sentence input.

    Checks that the three raw split files and the punctuation file exist
    under ``config["raw_path"]``, creates ``config["save_path"]`` if needed,
    copies the punctuation file there under its original name, and runs
    ``process_one_file_chinese`` on each split, writing the results to
    ``train``, ``dev`` and ``test`` inside ``save_path``.

    Args:
        config: Mapping that provides the keys ``raw_path``,
            ``raw_train_file``, ``raw_dev_file``, ``raw_test_file``,
            ``punc_file`` and ``save_path``.

    Raises:
        AssertionError: If one of the four input files is missing. The
            checks are ``assert`` statements, so they are skipped when
            Python runs with ``-O``.
    """
    raw_dir = config["raw_path"]
    # (output file name, config key holding the raw file name)
    splits = (
        ("train", "raw_train_file"),
        ("dev", "raw_dev_file"),
        ("test", "raw_test_file"),
    )

    # Validate every input before touching the output directory.
    for split_name, key in splits:
        assert os.path.exists(os.path.join(raw_dir, config[key])), \
            split_name + " file doesn't exist."
    punc_source = os.path.join(raw_dir, config["punc_file"])
    assert os.path.exists(punc_source), "punc file doesn't exist."

    save_dir = config["save_path"]
    # exist_ok avoids the race between checking for the directory and
    # creating it.
    os.makedirs(save_dir, exist_ok=True)
    shutil.copy(punc_source, os.path.join(save_dir, config["punc_file"]))

    for split_name, key in splits:
        process_one_file_chinese(
            os.path.join(raw_dir, config[key]),
            os.path.join(save_dir, split_name))
def process_one_chinese_pair(raw_path, save_path):
    """Convert a "<token> <label>" pair file into running text.

    Only lines that split on whitespace into exactly two fields are used;
    all other lines are ignored. For each used line the token, a space and
    ``CHINESE_PUNCTUATION_MAPPING[label]`` are written, followed by a
    separator (see the note in the body).

    Args:
        raw_path: Path of the UTF-8 pair file to read.
        save_path: Path of the UTF-8 text file to create or overwrite.

    Raises:
        KeyError: If a label is not a key of ``CHINESE_PUNCTUATION_MAPPING``.
    """
    # ``with`` closes both files even if a lookup or write fails.
    with open(raw_path, 'r', encoding='utf-8') as raw_file, \
            open(save_path, 'w', encoding='utf-8') as save_file:
        for line in raw_file:
            fields = line.strip().split()
            if len(fields) != 2:
                continue
            word, punc = fields
            save_file.write(word + ' ' + CHINESE_PUNCTUATION_MAPPING[punc])
            # NOTE(review): ``split()`` never yields an empty field, so this
            # comparison cannot be true as written and a space is always
            # emitted. The literal presumably held a non-ASCII
            # sentence-ending mark that was lost -- confirm against upstream.
            if punc == "":
                save_file.write("\n")
            else:
                save_file.write(" ")
def process_chinese_pair(config):
    """Build the train/dev/test files from Chinese token/label pair files.

    Checks that the three raw split files and the punctuation file exist
    under ``config["raw_path"]``, makes sure ``config["save_path"]`` exists,
    runs ``process_one_chinese_pair`` on each split (writing ``train``,
    ``dev`` and ``test`` inside ``save_path``) and finally copies the
    punctuation file there under its original name.

    Args:
        config: Mapping that provides the keys ``raw_path``,
            ``raw_train_file``, ``raw_dev_file``, ``raw_test_file``,
            ``punc_file`` and ``save_path``.

    Raises:
        AssertionError: If one of the four input files is missing. The
            checks are ``assert`` statements, so they are skipped when
            Python runs with ``-O``.
    """
    raw_dir = config["raw_path"]
    # (output file name, config key holding the raw file name)
    splits = (
        ("train", "raw_train_file"),
        ("dev", "raw_dev_file"),
        ("test", "raw_test_file"),
    )

    for split_name, key in splits:
        assert os.path.exists(os.path.join(raw_dir, config[key])), \
            split_name + " file doesn't exist."
    punc_source = os.path.join(raw_dir, config["punc_file"])
    assert os.path.exists(punc_source), "punc file doesn't exist."

    save_dir = config["save_path"]
    # The other entry points in this module create the output directory
    # before writing; without this the first open() for writing fails when
    # save_path does not exist yet.
    os.makedirs(save_dir, exist_ok=True)

    for split_name, key in splits:
        process_one_chinese_pair(
            os.path.join(raw_dir, config[key]),
            os.path.join(save_dir, split_name))

    shutil.copy(punc_source, os.path.join(save_dir, config["punc_file"]))
# Marks that process_one_file_english separates from the preceding text by
# inserting a space in front of them.
english_punc = list(',.?')
# Characters that process_one_file_english deletes from every line.
ignore_english_punc = list('"/')
def process_one_file_english(raw_path, save_path):
    """Rewrite an English text file with punctuation split into own tokens.

    For every input line, each character listed in ``ignore_english_punc``
    is deleted and each mark listed in ``english_punc`` gets a space
    inserted in front of it. The line is then stripped, split on single
    spaces, and every resulting piece is written followed by a space, with
    a newline closing the line.

    Args:
        raw_path: Path of the UTF-8 text file to read.
        save_path: Path of the UTF-8 text file to create or overwrite.
    """
    # ``with`` closes both files even if processing fails part-way through.
    with open(raw_path, 'r', encoding='utf-8') as raw_file, \
            open(save_path, 'w', encoding='utf-8') as save_file:
        for line in raw_file:
            for mark in ignore_english_punc:
                line = line.replace(mark, '')
            for mark in english_punc:
                line = line.replace(mark, ' ' + mark)
            # split(' ') (not split()) is kept on purpose: consecutive
            # spaces yield empty pieces, which are written out as-is.
            words = line.strip().split(' ')
            save_file.write(''.join(word + ' ' for word in words) + '\n')
def process_english_pure_senetence(config):
    """Build the train/dev/test files for plain English sentence input.

    Checks that the three raw split files and the punctuation file exist
    under ``config["raw_path"]``, creates ``config["save_path"]`` if needed,
    copies the punctuation file there under its original name, and runs
    ``process_one_file_english`` on each split, writing the results to
    ``train``, ``dev`` and ``test`` inside ``save_path``.

    Args:
        config: Mapping that provides the keys ``raw_path``,
            ``raw_train_file``, ``raw_dev_file``, ``raw_test_file``,
            ``punc_file`` and ``save_path``.

    Raises:
        AssertionError: If one of the four input files is missing. The
            checks are ``assert`` statements, so they are skipped when
            Python runs with ``-O``.
    """
    raw_dir = config["raw_path"]
    # (output file name, config key holding the raw file name)
    splits = (
        ("train", "raw_train_file"),
        ("dev", "raw_dev_file"),
        ("test", "raw_test_file"),
    )

    # Validate every input before touching the output directory.
    for split_name, key in splits:
        assert os.path.exists(os.path.join(raw_dir, config[key])), \
            split_name + " file doesn't exist."
    punc_source = os.path.join(raw_dir, config["punc_file"])
    assert os.path.exists(punc_source), "punc file doesn't exist."

    save_dir = config["save_path"]
    # exist_ok avoids the race between checking for the directory and
    # creating it.
    os.makedirs(save_dir, exist_ok=True)
    shutil.copy(punc_source, os.path.join(save_dir, config["punc_file"]))

    for split_name, key in splits:
        process_one_file_english(
            os.path.join(raw_dir, config[key]),
            os.path.join(save_dir, split_name))