You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
PaddleSpeech/tools/build_vocab.py

64 lines
1.8 KiB

"""Build vocabulary dictionary from manifest files.
Each item in vocabulary file is a character.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import codecs
import json
from collections import Counter
import os.path
# Command-line interface of the vocabulary builder: which manifests to read,
# the minimum character count to keep, and where to write the result.
parser = argparse.ArgumentParser(
    description='Build vocabulary dictionary from transcription texts.')
parser.add_argument(
    "--manifest_paths",
    type=str,
    # The trailing space matters: adjacent string literals are concatenated
    # verbatim, so without it the help read "dictionary.You can provide".
    help="Manifest paths for building vocabulary dictionary. "
    "You can provide multiple manifest files.",
    nargs='+',
    required=True)
parser.add_argument(
    "--count_threshold",
    default=0,
    type=int,
    help="Characters whose count below the threshold will be truncated. "
    "(default: %(default)s)")
parser.add_argument(
    "--vocab_path",
    default='datasets/vocab/zh_vocab.txt',
    type=str,
    help="Filepath to write vocabularies. (default: %(default)s)")
# Parsed at module level because main() reads the global ``args``.
args = parser.parse_args()
def count_manifest(counter, manifest_path):
    """Add the character counts of one manifest file to ``counter``.

    The manifest is read as UTF-8 text, one JSON document per line. For each
    line, every character of the document's ``'text'`` entry is counted.

    Args:
        counter: A ``collections.Counter`` that is updated in place.
        manifest_path: Path of the manifest file to read.

    Raises:
        Exception: If a line cannot be parsed as JSON. The message contains
            the manifest path and the underlying parser error.
    """
    # The context manager guarantees the file is closed, including when a
    # malformed line raises; iterating the bare codecs.open() result left the
    # handle open.
    with codecs.open(manifest_path, 'r', 'utf-8') as manifest_file:
        for json_line in manifest_file:
            try:
                json_data = json.loads(json_line)
            except Exception as e:
                raise Exception('Error parsing manifest: %s, %s' %
                                (manifest_path, e))
            # Counter.update() iterates its argument, so passing the whole
            # transcript string counts each character in a single call.
            counter.update(json_data['text'])
def main():
    """Count characters over all manifests and write the vocabulary file.

    Every path in ``args.manifest_paths`` is fed to ``count_manifest``. The
    characters are then written to ``args.vocab_path`` as UTF-8, one per
    line, in descending order of count. Writing stops at the first character
    whose count is below ``args.count_threshold``.
    """
    char_counts = Counter()
    for path in args.manifest_paths:
        count_manifest(char_counts, path)

    # With no argument, most_common() returns every (character, count) pair
    # sorted by descending count.
    ranked = char_counts.most_common()

    with codecs.open(args.vocab_path, 'w', 'utf-8') as vocab_file:
        for char, count in ranked:
            if count < args.count_threshold:
                # Pairs are in descending order, so none of the remaining
                # ones can reach the threshold either.
                break
            vocab_file.write(char + '\n')
# Build the vocabulary only when this file is executed as a script.
if __name__ == "__main__":
    main()