parent
7e39debcb0
commit
1325cd9b8e
@ -0,0 +1,16 @@
|
||||
"""Set up paths for DS2"""
|
||||
|
||||
import os.path
|
||||
import sys
|
||||
|
||||
|
||||
def add_path(path):
|
||||
if path not in sys.path:
|
||||
sys.path.insert(0, path)
|
||||
|
||||
|
||||
this_dir = os.path.dirname(__file__)
|
||||
|
||||
# Add project path to PYTHONPATH
|
||||
proj_path = os.path.join(this_dir, '..')
|
||||
add_path(proj_path)
|
@ -0,0 +1,63 @@
|
||||
"""Build vocabulary dictionary from manifest files.
|
||||
|
||||
Each item in vocabulary file is a character.
|
||||
"""
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import argparse
|
||||
import codecs
|
||||
import json
|
||||
from collections import Counter
|
||||
import os.path
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Build vocabulary dictionary from transcription texts.')
|
||||
parser.add_argument(
|
||||
"--manifest_paths",
|
||||
type=str,
|
||||
help="Manifest paths for building vocabulary dictionary."
|
||||
"You can provide multiple manifest files.",
|
||||
nargs='+',
|
||||
required=True)
|
||||
parser.add_argument(
|
||||
"--count_threshold",
|
||||
default=0,
|
||||
type=int,
|
||||
help="Characters whose count below the threshold will be truncated. "
|
||||
"(default: %(default)s)")
|
||||
parser.add_argument(
|
||||
"--vocab_path",
|
||||
default='datasets/vocab/zh_vocab.txt',
|
||||
type=str,
|
||||
help="Filepath to write vocabularies. (default: %(default)s)")
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
def count_manifest(counter, manifest_path):
|
||||
for json_line in codecs.open(manifest_path, 'r', 'utf-8'):
|
||||
try:
|
||||
json_data = json.loads(json_line)
|
||||
except Exception as e:
|
||||
raise Exception('Error parsing manifest: %s, %s' % \
|
||||
(manifest_path, e))
|
||||
text = json_data['text']
|
||||
for char in text:
|
||||
counter.update(char)
|
||||
|
||||
|
||||
def main():
|
||||
counter = Counter()
|
||||
for manifest_path in args.manifest_paths:
|
||||
count_manifest(counter, manifest_path)
|
||||
|
||||
count_sorted = sorted(counter.items(), key=lambda x: x[1], reverse=True)
|
||||
with codecs.open(args.vocab_path, 'w', 'utf-8') as fout:
|
||||
for item_pair in count_sorted:
|
||||
if item_pair[1] < args.count_threshold: break
|
||||
fout.write(item_pair[0] + '\n')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
Loading…
Reference in new issue