|
|
@ -1,4 +1,4 @@
|
|
|
|
"""Build vocabulary dictionary from manifest files.
|
|
|
|
"""Build vocabulary from manifest files.
|
|
|
|
|
|
|
|
|
|
|
|
Each item in vocabulary file is a character.
|
|
|
|
Each item in vocabulary file is a character.
|
|
|
|
"""
|
|
|
|
"""
|
|
|
@ -11,13 +11,14 @@ import codecs
|
|
|
|
import json
|
|
|
|
import json
|
|
|
|
from collections import Counter
|
|
|
|
from collections import Counter
|
|
|
|
import os.path
|
|
|
|
import os.path
|
|
|
|
|
|
|
|
import _init_paths
|
|
|
|
|
|
|
|
from data_utils import utils
|
|
|
|
|
|
|
|
|
|
|
|
parser = argparse.ArgumentParser(
|
|
|
|
parser = argparse.ArgumentParser(description=__doc__)
|
|
|
|
description='Build vocabulary dictionary from transcription texts.')
|
|
|
|
|
|
|
|
parser.add_argument(
|
|
|
|
parser.add_argument(
|
|
|
|
"--manifest_paths",
|
|
|
|
"--manifest_paths",
|
|
|
|
type=str,
|
|
|
|
type=str,
|
|
|
|
help="Manifest paths for building vocabulary dictionary."
|
|
|
|
help="Manifest paths for building vocabulary."
|
|
|
|
"You can provide multiple manifest files.",
|
|
|
|
"You can provide multiple manifest files.",
|
|
|
|
nargs='+',
|
|
|
|
nargs='+',
|
|
|
|
required=True)
|
|
|
|
required=True)
|
|
|
@ -25,25 +26,20 @@ parser.add_argument(
|
|
|
|
"--count_threshold",
|
|
|
|
"--count_threshold",
|
|
|
|
default=0,
|
|
|
|
default=0,
|
|
|
|
type=int,
|
|
|
|
type=int,
|
|
|
|
help="Characters whose count below the threshold will be truncated. "
|
|
|
|
help="Characters whose counts are below the threshold will be truncated. "
|
|
|
|
"(default: %(default)s)")
|
|
|
|
"(default: %(default)i)")
|
|
|
|
parser.add_argument(
|
|
|
|
parser.add_argument(
|
|
|
|
"--vocab_path",
|
|
|
|
"--vocab_path",
|
|
|
|
default='datasets/vocab/zh_vocab.txt',
|
|
|
|
default='datasets/vocab/zh_vocab.txt',
|
|
|
|
type=str,
|
|
|
|
type=str,
|
|
|
|
help="Filepath to write vocabularies. (default: %(default)s)")
|
|
|
|
help="File path to write the vocabulary. (default: %(default)s)")
|
|
|
|
args = parser.parse_args()
|
|
|
|
args = parser.parse_args()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def count_manifest(counter, manifest_path):
|
|
|
|
def count_manifest(counter, manifest_path):
|
|
|
|
for json_line in codecs.open(manifest_path, 'r', 'utf-8'):
|
|
|
|
manifest_jsons = utils.read_manifest(manifest_path)
|
|
|
|
try:
|
|
|
|
for line_json in manifest_jsons:
|
|
|
|
json_data = json.loads(json_line)
|
|
|
|
for char in line_json['text']:
|
|
|
|
except Exception as e:
|
|
|
|
|
|
|
|
raise Exception('Error parsing manifest: %s, %s' % \
|
|
|
|
|
|
|
|
(manifest_path, e))
|
|
|
|
|
|
|
|
text = json_data['text']
|
|
|
|
|
|
|
|
for char in text:
|
|
|
|
|
|
|
|
counter.update(char)
|
|
|
|
counter.update(char)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -54,9 +50,9 @@ def main():
|
|
|
|
|
|
|
|
|
|
|
|
count_sorted = sorted(counter.items(), key=lambda x: x[1], reverse=True)
|
|
|
|
count_sorted = sorted(counter.items(), key=lambda x: x[1], reverse=True)
|
|
|
|
with codecs.open(args.vocab_path, 'w', 'utf-8') as fout:
|
|
|
|
with codecs.open(args.vocab_path, 'w', 'utf-8') as fout:
|
|
|
|
for item_pair in count_sorted:
|
|
|
|
for char, count in count_sorted:
|
|
|
|
if item_pair[1] < args.count_threshold: break
|
|
|
|
if count < args.count_threshold: break
|
|
|
|
fout.write(item_pair[0] + '\n')
|
|
|
|
fout.write(char + '\n')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
|
if __name__ == '__main__':
|
|
|
|