#!/usr/bin/env python3
# Provenance: commit f852514a3ef31b32f583c17bc282e4e0db809719 by Yang Zhou
# (Wed, 25 May 2022 21:25:01 +0800), "mv text_to_lexicon.py to utils",
# which creates this script as utils/text_to_lexicon.py (mode 100755).
# The same commit changes speechx/examples/ds2_ol/aishell/run_build_tlg.sh
# to call the script from its new location:
#   - local/text_to_lexicon.py --has_key true --text $text --lexicon $lexicon
#   + utils/text_to_lexicon.py --has_key true --text $text --lexicon $lexicon
"""Build a character-level lexicon from a whitespace-tokenised text file.

Input line:  ``utt1 中国 人`` (with utterance key) or ``中国 人`` (without).
Output line: ``中国<TAB>中 国`` -- the word, a tab, then its characters
separated by single spaces.
"""
import argparse
from collections import Counter

_TRUE_STRINGS = frozenset({"1", "true", "t", "yes", "y"})
_FALSE_STRINGS = frozenset({"0", "false", "f", "no", "n"})


def _parse_bool(value):
    """Convert a command-line style flag value into a real ``bool``.

    Args:
        value: A ``bool`` (returned unchanged) or a string such as
            ``"true"`` / ``"false"`` (case-insensitive).

    Returns:
        The corresponding boolean.

    Raises:
        ValueError: If ``value`` is not a recognised boolean spelling.
            argparse turns this into a normal usage error when the
            function is used as ``type=``.
    """
    if isinstance(value, bool):
        return value
    lowered = str(value).strip().lower()
    if lowered in _TRUE_STRINGS:
        return True
    if lowered in _FALSE_STRINGS:
        return False
    raise ValueError(f"expected a boolean such as true/false, got {value!r}")


def main(args):
    """Write one lexicon entry per distinct word found in ``args.text``.

    Args:
        args: Namespace-like object with attributes
            ``text`` (input path), ``lexicon`` (output path) and
            ``has_key`` (bool, or a string like ``"true"``/``"false"``;
            when true the first field of every line is an utterance key
            and is not treated as a word).

    Raises:
        ValueError: If ``args.has_key`` is not a recognised boolean value.
        OSError: If the input cannot be read or the output cannot be written.
    """
    # A bare string from the command line ("false") would otherwise be truthy.
    has_key = _parse_bool(args.has_key)

    # Counter keeps first-seen order, so entries come out in corpus order.
    counter = Counter()
    with open(args.text, 'r', encoding='utf-8') as fin:
        for line in fin:
            fields = line.split()
            # Blank and key-only lines simply contribute no words.
            counter.update(fields[1:] if has_key else fields)

    # Open the output only after the input was read successfully, so a
    # failure does not leave an empty lexicon that callers mistake for done.
    with open(args.lexicon, 'w', encoding='utf-8') as fout:
        fout.writelines(f"{word}\t{' '.join(word)}\n" for word in counter)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='text(line:utt1 中国 人) to lexicon(line:中国 中 国).')
    parser.add_argument(
        '--has_key',
        type=_parse_bool,
        default=True,
        help='whether each text line starts with an utt key (true/false)')
    parser.add_argument(
        '--text', required=True, help='text path. line: utt1 中国 人 or 中国 人')
    parser.add_argument(
        '--lexicon', required=True, help='lexicon path. line:中国 中 国')
    args = parser.parse_args()
    print(args)

    main(args)