#!/usr/bin/env python3
"""Command-line front end for mmseg: segment text read from stdin.

The segmented tokens are written to stdout, joined by a configurable
separator (a single space by default).
"""
import cProfile
import functools
import getopt
import os
import pstats
import sys
from io import StringIO
from os.path import dirname
from os.path import join

import mmseg


class Dictionary():
    """Locations of the bundled dictionaries and a loader for them."""

    # (kind, path) pairs; paths are resolved relative to this file.
    dictionaries = (
        ('chars',
         os.path.join(os.path.dirname(__file__),
                      '../mmseg/data', 'chars.dic')),
        ('words',
         os.path.join(os.path.dirname(__file__),
                      '../mmseg/data', 'words.dic')),
    )

    @staticmethod
    def load_dictionaries():
        """Load every dictionary listed in ``Dictionary.dictionaries``.

        Entries whose kind is neither 'chars' nor 'words' are ignored.

        Raises:
            IOError: if the mmseg loader reports failure for a file.
        """
        # Looked up at call time, so a replaced mmseg loader is honoured.
        loaders = {
            'chars': mmseg.load_chars,
            'words': mmseg.load_words,
        }
        for kind, path in Dictionary.dictionaries:
            loader = loaders.get(kind)
            if loader is not None and not loader(path):
                raise IOError("Cannot open '%s'" % path)


# Install the loader above as mmseg's default dictionary loader.
mmseg.dict_load_defaults = Dictionary.load_dictionaries


class Algorithm(object):
    """Single-pass tokenizer wrapping ``mmseg.Algorithm``."""

    def __init__(self, text: str):
        """Create an Algorithm instance to segment text."""
        # Set first so that __del__ is safe even if construction fails below.
        self.destroied = False
        # Keep a reference to the encoded buffer so it is not GC-ed.
        # NOTE(review): the encoded bytes are stored, but mmseg.Algorithm is
        # handed the original str -- confirm which object the binding
        # actually needs kept alive.
        self.text = text.encode('utf8')
        self.algor = mmseg.Algorithm(text)

    def __iter__(self):
        """Iterate through all tokens.

        Iteration has a side effect: an Algorithm object can only be
        iterated once.
        """
        while True:
            tk = self.next_token()
            if tk is None:
                # A plain return ends the generator.  Raising StopIteration
                # here is converted to RuntimeError under PEP 479.
                return
            yield tk

    def next_token(self):
        """Return the next token, or None when no token is available."""
        if self.destroied:
            return None

        tk = self.algor.next_token()
        if tk.length == 0:
            # No token available: the algorithm object can be destroyed.
            self._destroy()
            return None
        return tk

    def _destroy(self):
        """Mark this object as exhausted; calling it again is harmless."""
        if not self.destroied:
            self.destroied = True

    def __del__(self):
        self._destroy()


def profile(fn):
    """Decorate *fn* so each call is profiled with cProfile.

    The statistics (sorted by time), the callers and the callees are
    printed to stderr after the call, whether or not *fn* raised.
    """
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        profiler = cProfile.Profile()
        stream = StringIO()
        profiler.enable()
        try:
            res = fn(*args, **kwargs)
        finally:
            profiler.disable()
            stats = pstats.Stats(profiler, stream=stream)
            stats.sort_stats('time')
            print("", file=stream)
            print("=" * 100, file=stream)
            print("Stats:", file=stream)
            stats.print_stats()
            print("=" * 100, file=stream)
            print("Callers:", file=stream)
            stats.print_callers()
            print("=" * 100, file=stream)
            print("Callees:", file=stream)
            stats.print_callees()
            print(stream.getvalue(), file=sys.stderr)
            stream.close()
        return res
    return wrapper


def print_usage():
    """Print the command-line help text and exit with status 0."""
    print("""
mmseg  Segment Chinese text. Read from stdin and print to stdout.

Options:
  -h
  --help       Print this message

  -s
  --separator  Select the separator of the segmented text. Default is space.
""")
    sys.exit(0)


def process_tokens(stdin, separator):
    """Segment *stdin* and return its tokens joined by *separator*.

    Args:
        stdin: the text to segment.
        separator: the string placed between consecutive tokens.

    Returns:
        The joined token texts; an empty string when there are no tokens.
    """
    # No RuntimeError guard is needed: Algorithm.__iter__ now terminates
    # cleanly, so an error raised here is a real one and should propagate.
    return separator.join(tk.text for tk in Algorithm(stdin))


def main(argv=None):
    """Parse the options, load the dictionaries and segment stdin.

    Args:
        argv: the argument list to parse; defaults to ``sys.argv[1:]``.
    """
    if argv is None:
        argv = sys.argv[1:]

    separator = " "
    try:
        # The long options are the ones advertised by print_usage().
        optlst, _args = getopt.getopt(argv, 'hs:', ['help', 'separator='])
    except getopt.GetoptError as err:
        print(err, file=sys.stderr)
        sys.exit(2)

    for opt, val in optlst:
        if opt in ('-h', '--help'):
            print_usage()
        elif opt in ('-s', '--separator'):
            separator = val

    # load default dictionaries
    mmseg.dict_load_defaults()

    sys.stdout.write(process_tokens(sys.stdin.read(), separator))
    sys.stdout.write('\n')


if __name__ == '__main__':
    main()