Simplify code and comments.

pull/2/head
yangyaming 7 years ago
parent 1325cd9b8e
commit c2e6378a64

@ -1,4 +1,7 @@
"""Set up paths for DS2""" """Set up paths for DS2"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os.path import os.path
import sys import sys

@ -1,4 +1,4 @@
"""Build vocabulary dictionary from manifest files. """Build vocabulary from manifest files.
Each item in vocabulary file is a character. Each item in vocabulary file is a character.
""" """
@ -11,13 +11,14 @@ import codecs
import json import json
from collections import Counter from collections import Counter
import os.path import os.path
import _init_paths
from data_utils import utils
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(description=__doc__)
description='Build vocabulary dictionary from transcription texts.')
parser.add_argument( parser.add_argument(
"--manifest_paths", "--manifest_paths",
type=str, type=str,
help="Manifest paths for building vocabulary dictionary." help="Manifest paths for building vocabulary."
"You can provide multiple manifest files.", "You can provide multiple manifest files.",
nargs='+', nargs='+',
required=True) required=True)
@ -25,25 +26,20 @@ parser.add_argument(
"--count_threshold", "--count_threshold",
default=0, default=0,
type=int, type=int,
help="Characters whose count below the threshold will be truncated. " help="Characters whose counts are below the threshold will be truncated. "
"(default: %(default)s)") "(default: %(default)i)")
parser.add_argument( parser.add_argument(
"--vocab_path", "--vocab_path",
default='datasets/vocab/zh_vocab.txt', default='datasets/vocab/zh_vocab.txt',
type=str, type=str,
help="Filepath to write vocabularies. (default: %(default)s)") help="File path to write the vocabulary. (default: %(default)s)")
args = parser.parse_args() args = parser.parse_args()
def count_manifest(counter, manifest_path): def count_manifest(counter, manifest_path):
for json_line in codecs.open(manifest_path, 'r', 'utf-8'): manifest_jsons = utils.read_manifest(manifest_path)
try: for line_json in manifest_jsons:
json_data = json.loads(json_line) for char in line_json['text']:
except Exception as e:
raise Exception('Error parsing manifest: %s, %s' % \
(manifest_path, e))
text = json_data['text']
for char in text:
counter.update(char) counter.update(char)
@ -54,9 +50,9 @@ def main():
count_sorted = sorted(counter.items(), key=lambda x: x[1], reverse=True) count_sorted = sorted(counter.items(), key=lambda x: x[1], reverse=True)
with codecs.open(args.vocab_path, 'w', 'utf-8') as fout: with codecs.open(args.vocab_path, 'w', 'utf-8') as fout:
for item_pair in count_sorted: for char, count in count_sorted:
if item_pair[1] < args.count_threshold: break if count < args.count_threshold: break
fout.write(item_pair[0] + '\n') fout.write(char + '\n')
if __name__ == '__main__': if __name__ == '__main__':

Loading…
Cancel
Save