Simplify code and comments.

8 years ago · c2e6378a64
parent 1325cd9b8e
commit c2e6378a64
2 changed files with 17 additions and 18 deletions
--- a/tools/_init_paths.py
+++ b/tools/_init_paths.py
@@ -1,4 +1,7 @@
 """Set up paths for DS2"""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 import os.path
 import sys
--- a/tools/build_vocab.py
+++ b/tools/build_vocab.py
@@ -1,4 +1,4 @@
-"""Build vocabulary dictionary from manifest files.
+"""Build vocabulary from manifest files.
 Each item in vocabulary file is a character.
 """
@@ -11,13 +11,14 @@ import codecs
 import json
 from collections import Counter
 import os.path
 import _init_paths
 from data_utils import utils
-parser = argparse.ArgumentParser(
+parser = argparse.ArgumentParser(description=__doc__)
    description='Build vocabulary dictionary from transcription texts.')
 parser.add_argument(
    "--manifest_paths",
    type=str,
-    help="Manifest paths for building vocabulary dictionary."
+    help="Manifest paths for building vocabulary."
    "You can provide multiple manifest files.",
    nargs='+',
    required=True)
@@ -25,25 +26,20 @@ parser.add_argument(
    "--count_threshold",
    default=0,
    type=int,
-    help="Characters whose count below the threshold will be truncated. "
+    help="Characters whose counts are below the threshold will be truncated. "
-    "(default: %(default)s)")
+    "(default: %(default)i)")
 parser.add_argument(
    "--vocab_path",
    default='datasets/vocab/zh_vocab.txt',
    type=str,
-    help="Filepath to write vocabularies. (default: %(default)s)")
+    help="File path to write the vocabulary. (default: %(default)s)")
 args = parser.parse_args()
 def count_manifest(counter, manifest_path):
-    for json_line in codecs.open(manifest_path, 'r', 'utf-8'):
+    manifest_jsons = utils.read_manifest(manifest_path)
-        try:
+    for line_json in manifest_jsons:
-            json_data = json.loads(json_line)
+        for char in line_json['text']:
        except Exception as e:
            raise Exception('Error parsing manifest: %s, %s' % \
                    (manifest_path, e))
        text = json_data['text']
        for char in text:
            counter.update(char)
@@ -54,9 +50,9 @@ def main():
    count_sorted = sorted(counter.items(), key=lambda x: x[1], reverse=True)
    with codecs.open(args.vocab_path, 'w', 'utf-8') as fout:
-        for item_pair in count_sorted:
+        for char, count in count_sorted:
-            if item_pair[1] < args.count_threshold: break
+            if count < args.count_threshold: break
-            fout.write(item_pair[0] + '\n')
+            fout.write(char + '\n')
 if __name__ == '__main__':