Simplify code and comments.

pull/2/head
yangyaming 7 years ago
parent 1325cd9b8e
commit c2e6378a64

@ -1,4 +1,7 @@
"""Set up paths for DS2"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os.path
import sys

@ -1,4 +1,4 @@
"""Build vocabulary dictionary from manifest files.
"""Build vocabulary from manifest files.
Each item in vocabulary file is a character.
"""
@ -11,13 +11,14 @@ import codecs
import json
from collections import Counter
import os.path
import _init_paths
from data_utils import utils
# Command-line interface. The module docstring doubles as the description
# shown by --help, so the two cannot drift apart.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--manifest_paths",
    type=str,
    # Adjacent string literals are concatenated verbatim, so the first one
    # must end with a space to keep the two sentences apart in --help.
    help="Manifest paths for building vocabulary. "
    "You can provide multiple manifest files.",
    nargs='+',
    required=True)
@ -25,25 +26,20 @@ parser.add_argument(
"--count_threshold",
default=0,
type=int,
help="Characters whose count below the threshold will be truncated. "
"(default: %(default)s)")
help="Characters whose counts are below the threshold will be truncated. "
"(default: %(default)i)")
# Destination of the generated vocabulary file.
parser.add_argument(
    "--vocab_path",
    default='datasets/vocab/zh_vocab.txt',
    type=str,
    help="File path to write the vocabulary. (default: %(default)s)")
# Arguments are parsed at module level; `args` is read as a global below.
args = parser.parse_args()
def count_manifest(counter, manifest_path):
    """Add the characters of every transcript in a manifest to `counter`.

    :param counter: Counter updated in place; its `update` method is called
                    once per character.
    :type counter: collections.Counter
    :param manifest_path: Path of the manifest file, passed unchanged to
                          `utils.read_manifest`.
    :type manifest_path: basestring
    """
    # NOTE(review): assumes utils.read_manifest yields one dict per manifest
    # line, each with a 'text' entry holding the transcript -- confirm
    # against data_utils.utils.
    manifest_jsons = utils.read_manifest(manifest_path)
    for line_json in manifest_jsons:
        for char in line_json['text']:
            # `char` is a single character, so update() bumps its count by 1.
            counter.update(char)
@ -54,9 +50,9 @@ def main():
count_sorted = sorted(counter.items(), key=lambda x: x[1], reverse=True)
with codecs.open(args.vocab_path, 'w', 'utf-8') as fout:
for item_pair in count_sorted:
if item_pair[1] < args.count_threshold: break
fout.write(item_pair[0] + '\n')
for char, count in count_sorted:
if count < args.count_threshold: break
fout.write(char + '\n')
if __name__ == '__main__':

Loading…
Cancel
Save