From c2e6378a64b1526076e4fb99aa6f9228d25891c8 Mon Sep 17 00:00:00 2001
From: yangyaming
Date: Wed, 9 Aug 2017 23:03:30 +0800
Subject: [PATCH] Simplify codes and comments.

---
Review notes (text between the "---" line and the diffstat is not recorded in
the commit):

tools/_init_paths.py
  * Adds the absolute_import, division and print_function __future__ imports.

tools/build_vocab.py
  * Shortens the module docstring and hands it to argparse through
    description=__doc__ instead of a separate description literal.
  * Imports _init_paths and data_utils.utils. count_manifest() now iterates
    the result of utils.read_manifest(manifest_path) instead of decoding each
    JSON line inline; this also removes the local "Error parsing manifest"
    wrapper (what read_manifest raises on bad input is not visible here).
  * Rewords the --manifest_paths, --count_threshold and --vocab_path help
    strings; --count_threshold now formats its default with %(default)i.
  * main() unpacks (char, count) pairs instead of indexing item_pair.

NOTE(review): the --manifest_paths help is still two adjacent string literals
with no space between them, so it renders as "vocabulary.You can provide ...".
NOTE(review): line breaks, blank lines and leading whitespace in this patch
were rebuilt from the hunk line counts; confirm them against the pre-image
blobs (3bb2fd19, 59be4031) before applying.

 tools/_init_paths.py |  3 +++
 tools/build_vocab.py | 32 ++++++++++++++------------------
 2 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/tools/_init_paths.py b/tools/_init_paths.py
index 3bb2fd19..ddabb535 100644
--- a/tools/_init_paths.py
+++ b/tools/_init_paths.py
@@ -1,4 +1,7 @@
 """Set up paths for DS2"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
 
 import os.path
 import sys
diff --git a/tools/build_vocab.py b/tools/build_vocab.py
index 59be4031..618f2498 100644
--- a/tools/build_vocab.py
+++ b/tools/build_vocab.py
@@ -1,4 +1,4 @@
-"""Build vocabulary dictionary from manifest files.
+"""Build vocabulary from manifest files.
 
 Each item in vocabulary file is a character.
 """
@@ -11,13 +11,14 @@ import codecs
 import json
 from collections import Counter
 import os.path
+import _init_paths
+from data_utils import utils
 
-parser = argparse.ArgumentParser(
-    description='Build vocabulary dictionary from transcription texts.')
+parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
     "--manifest_paths",
     type=str,
-    help="Manifest paths for building vocabulary dictionary."
+    help="Manifest paths for building vocabulary."
     "You can provide multiple manifest files.",
     nargs='+',
     required=True)
@@ -25,25 +26,20 @@ parser.add_argument(
     "--count_threshold",
     default=0,
     type=int,
-    help="Characters whose count below the threshold will be truncated. "
-    "(default: %(default)s)")
+    help="Characters whose counts are below the threshold will be truncated. "
+    "(default: %(default)i)")
 parser.add_argument(
     "--vocab_path",
     default='datasets/vocab/zh_vocab.txt',
     type=str,
-    help="Filepath to write vocabularies. (default: %(default)s)")
+    help="File path to write the vocabulary. (default: %(default)s)")
 args = parser.parse_args()
 
 
 def count_manifest(counter, manifest_path):
-    for json_line in codecs.open(manifest_path, 'r', 'utf-8'):
-        try:
-            json_data = json.loads(json_line)
-        except Exception as e:
-            raise Exception('Error parsing manifest: %s, %s' % \
-                            (manifest_path, e))
-        text = json_data['text']
-        for char in text:
+    manifest_jsons = utils.read_manifest(manifest_path)
+    for line_json in manifest_jsons:
+        for char in line_json['text']:
             counter.update(char)
 
 
@@ -54,9 +50,9 @@ def main():
 
     count_sorted = sorted(counter.items(), key=lambda x: x[1], reverse=True)
     with codecs.open(args.vocab_path, 'w', 'utf-8') as fout:
-        for item_pair in count_sorted:
-            if item_pair[1] < args.count_threshold: break
-            fout.write(item_pair[0] + '\n')
+        for char, count in count_sorted:
+            if count < args.count_threshold: break
+            fout.write(char + '\n')
 
 
 if __name__ == '__main__':