From c2e6378a64b1526076e4fb99aa6f9228d25891c8 Mon Sep 17 00:00:00 2001
From: yangyaming <mxscmxsc@gmail.com>
Date: Wed, 9 Aug 2017 23:03:30 +0800
Subject: [PATCH] Simplify code and comments.

---
 tools/_init_paths.py |  3 +++
 tools/build_vocab.py | 32 ++++++++++++++------------------
 2 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/tools/_init_paths.py b/tools/_init_paths.py
index 3bb2fd19..ddabb535 100644
--- a/tools/_init_paths.py
+++ b/tools/_init_paths.py
@@ -1,4 +1,7 @@
 """Set up paths for DS2"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
 
 import os.path
 import sys
diff --git a/tools/build_vocab.py b/tools/build_vocab.py
index 59be4031..618f2498 100644
--- a/tools/build_vocab.py
+++ b/tools/build_vocab.py
@@ -1,4 +1,4 @@
-"""Build vocabulary dictionary from manifest files.
+"""Build vocabulary from manifest files.
 
 Each item in vocabulary file is a character.
 """
@@ -11,13 +11,14 @@ import codecs
 import json
 from collections import Counter
 import os.path
+import _init_paths
+from data_utils import utils
 
-parser = argparse.ArgumentParser(
-    description='Build vocabulary dictionary from transcription texts.')
+parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
     "--manifest_paths",
     type=str,
-    help="Manifest paths for building vocabulary dictionary."
+    help="Manifest paths for building vocabulary. "
     "You can provide multiple manifest files.",
     nargs='+',
     required=True)
@@ -25,25 +26,20 @@ parser.add_argument(
     "--count_threshold",
     default=0,
     type=int,
-    help="Characters whose count below the threshold will be truncated. "
-    "(default: %(default)s)")
+    help="Characters whose counts are below the threshold will be truncated. "
+    "(default: %(default)i)")
 parser.add_argument(
     "--vocab_path",
     default='datasets/vocab/zh_vocab.txt',
     type=str,
-    help="Filepath to write vocabularies. (default: %(default)s)")
+    help="File path to write the vocabulary. (default: %(default)s)")
 args = parser.parse_args()
 
 
 def count_manifest(counter, manifest_path):
-    for json_line in codecs.open(manifest_path, 'r', 'utf-8'):
-        try:
-            json_data = json.loads(json_line)
-        except Exception as e:
-            raise Exception('Error parsing manifest: %s, %s' % \
-                    (manifest_path, e))
-        text = json_data['text']
-        for char in text:
+    manifest_jsons = utils.read_manifest(manifest_path)
+    for line_json in manifest_jsons:
+        for char in line_json['text']:
             counter.update(char)
 
 
@@ -54,9 +50,9 @@ def main():
 
     count_sorted = sorted(counter.items(), key=lambda x: x[1], reverse=True)
     with codecs.open(args.vocab_path, 'w', 'utf-8') as fout:
-        for item_pair in count_sorted:
-            if item_pair[1] < args.count_threshold: break
-            fout.write(item_pair[0] + '\n')
+        for char, count in count_sorted:
+            if count < args.count_threshold: break
+            fout.write(char + '\n')
 
 
 if __name__ == '__main__':