You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
PaddleSpeech/speechx/examples/ngram/zh/local/text_to_lexicon.py

38 lines
1.1 KiB

#!/usr/bin/env python3
import argparse
2 years ago
from collections import Counter
2 years ago
def main(args):
    """Write a character-level lexicon for every distinct word in a text file.

    Each input line is split on whitespace. When ``args.has_key`` is truthy
    the first token of the line is treated as an utterance id and discarded;
    the remaining tokens are the words. For every distinct word, one line
    ``<word>\\t<c1> <c2> ...`` is written, where ``c1 c2 ...`` are the word's
    characters separated by single spaces. Words are emitted in the order in
    which they are first seen.

    Blank lines and lines that contain only an utterance id contribute no
    words and are skipped instead of raising.

    Args:
        args: object with the attributes ``text`` (input path), ``lexicon``
            (output path) and ``has_key`` (whether lines start with an
            utterance id).
    """
    counter = Counter()
    # Explicit UTF-8 so the result does not depend on the platform locale.
    with open(args.text, 'r', encoding='utf-8') as fin, \
            open(args.lexicon, 'w', encoding='utf-8') as fout:
        for line in fin:
            words = line.split()
            if args.has_key:
                # Slicing (rather than unpacking) tolerates empty and
                # key-only lines: both simply yield no words.
                words = words[1:]
            counter.update(words)

        # Only the distinct words are used; the counts are not written.
        for word in counter:
            chars = " ".join(word)
            fout.write(f"{word}\t{chars}\n")
2 years ago
def str2bool(value):
    """Convert a command-line string to a bool for use as an argparse type.

    Accepts (case-insensitively) true/t/yes/y/1 as True and
    false/f/no/n/0 as False. The empty string also maps to False, because an
    empty value was the only falsy input before this parser existed.

    Raises:
        argparse.ArgumentTypeError: if the value is not a recognized boolean.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ('true', 't', 'yes', 'y', '1'):
        return True
    if lowered in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(
        f"expected a boolean value (true/false), got {value!r}")


def build_parser():
    """Return the argument parser for the text-to-lexicon command line."""
    parser = argparse.ArgumentParser(
        description='text(line:utt1 中国 人) to lexicon(line:中国 中 国).')
    parser.add_argument(
        '--has_key',
        type=str2bool,  # without a type, any non-empty string is truthy
        default=True,
        help='whether each text line starts with an utt id (true/false)')
    parser.add_argument(
        '--text', required=True, help='text path. line: utt1 中国 人 or 中国 人')
    parser.add_argument(
        '--lexicon', required=True, help='lexicon path. line:中国 中 国')
    return parser


if __name__ == '__main__':
    args = build_parser().parse_args()
    print(args)
    main(args)