You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
PaddleSpeech/examples/other/rhy/local/pre_for_sp_csmsc.py

52 lines
1.4 KiB

import argparse
import os
import re
# Maps each rhythm tag ("#1".."#4") to the single-character symbol that
# stands in for it in the processed text.
replace_ = {"#1": "%", "#2": "`", "#3": "~", "#4": "$"}


def replace_rhy_with_punc(line):
    """Replace the rhythm tags in ``line`` with one-character symbols.

    Literal ``^``, ``$``, ``*`` and ``%`` characters are removed first;
    presumably this keeps a ``%`` or ``$`` that was already in the raw text
    from being confused with a converted ``#1`` / ``#4`` tag.

    Args:
        line: Text that may contain ``#1`` .. ``#4`` tags.

    Returns:
        The text with every tag listed in ``replace_`` substituted.
    """
    # Original note: see check_oov.py for a broader punctuation class.
    # The earlier pattern r'^$\*%' was a sequence of anchors followed by a
    # literal "*%", which can never match, so nothing was being stripped.
    # A character class removes the four characters it listed.
    line = re.sub(r'[\^$*%]', '', line)
    for tag, symbol in replace_.items():
        # str.replace is already a no-op when the tag is absent.
        line = line.replace(tag, symbol)
    return line
def pre_and_write(data, file):
    """Convert labelled entries and write them to ``file``, one per line.

    For each entry, the second tab-separated field is taken, stripped,
    passed through ``replace_rhy_with_punc``, and written with a single
    space between every character, followed by a trailing space and a
    newline.

    Args:
        data: Iterable of strings containing at least one tab; the field
            after the first tab is the text to convert.
        file: Path of the output file. It is created or overwritten.

    Raises:
        IndexError: If an entry contains no tab character.
    """
    # Written as UTF-8 explicitly: the labels are presumably Chinese text,
    # and the platform-default encoding may not be able to represent them.
    with open(file, 'w', encoding='utf-8') as out:
        for entry in data:
            text = entry.split('\t')[1].strip()
            text = replace_rhy_with_punc(text)
            # Joining a string with ' ' separates its individual characters.
            out.write(' '.join(text) + ' \n')
def main():
    """Split a label file into train/test/dev files of converted text.

    Reads ``--data``, keeps every other line starting with the first, and
    writes consecutive 90% / 5% / 5% slices of those lines to ``train.txt``,
    ``test.txt`` and ``dev.txt`` under ``--processed_path`` (created if
    missing) via ``pre_and_write``.
    """
    parser = argparse.ArgumentParser(
        description="Train a Rhy prediction model.")
    parser.add_argument("--data", type=str, default="label_train-set.txt")
    parser.add_argument(
        "--processed_path", type=str, default="../data/rhy_predict")
    args = parser.parse_args()
    print(args.data, args.processed_path)
    os.makedirs(args.processed_path, exist_ok=True)

    # Read as UTF-8 explicitly rather than relying on the platform default;
    # assumes the label file is UTF-8 encoded -- TODO confirm.
    with open(args.data, encoding="utf-8") as label_file:
        lines = label_file.readlines()

    # Even-indexed lines only; presumably the odd lines carry a different
    # annotation for the same utterance -- verify against the dataset.
    text = lines[0::2]
    total = len(text)

    # NOTE: int() truncates each size, so when ``total`` is not evenly
    # divisible a few trailing samples end up in none of the three files.
    split_sizes = [int(total * 0.9), int(total * 0.05), int(total * 0.05)]
    split_names = ['train.txt', 'test.txt', 'dev.txt']

    start = 0
    for size, name in zip(split_sizes, split_names):
        out_path = os.path.join(args.processed_path, name)
        pre_and_write(text[start:start + size], out_path)
        start += size
# Run the preprocessing only when executed as a script, not on import.
if __name__ == "__main__":
    main()