You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
PaddleSpeech/tools/build_vocab.py

59 lines
1.6 KiB

"""Build vocabulary from manifest files.
Each item in vocabulary file is a character.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import functools
import codecs
import json
from collections import Counter
import os.path
import _init_paths
from data_utils.utility import read_manifest
from utils.utility import add_arguments, print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('count_threshold', int, 0, "Truncation threshold for char counts.")
add_arg('vocab_path', str,
'data/librispeech/vocab.txt',
"Filepath to write the vocabulary.")
add_arg('manifest_paths', str,
None,
"Filepaths of manifests for building vocabulary. "
"You can provide multiple manifest files.",
nargs='+',
required=True)
# yapf: disable
args = parser.parse_args()
def count_manifest(counter, manifest_path):
manifest_jsons = read_manifest(manifest_path)
for line_json in manifest_jsons:
for char in line_json['text']:
counter.update(char)
def main():
print_arguments(args)
counter = Counter()
for manifest_path in args.manifest_paths:
count_manifest(counter, manifest_path)
count_sorted = sorted(counter.items(), key=lambda x: x[1], reverse=True)
with codecs.open(args.vocab_path, 'w', 'utf-8') as fout:
for char, count in count_sorted:
if count < args.count_threshold: break
fout.write(char + '\n')
if __name__ == '__main__':
main()