You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
159 lines
3.7 KiB
159 lines
3.7 KiB
#!/usr/bin/env python3
|
|
import cProfile
|
|
import getopt
|
|
import os
|
|
import pstats
|
|
import sys
|
|
from io import StringIO
|
|
from os.path import dirname
|
|
from os.path import join
|
|
|
|
import mmseg
|
|
|
|
|
|
class Dictionary():
    """Describe the bundled dictionary files and load them into mmseg."""

    # (kind, path) pairs. Paths are built relative to the directory that
    # contains this file.
    dictionaries = (
        ('chars', os.path.join(os.path.dirname(__file__), '../mmseg/data', 'chars.dic')),
        ('words', os.path.join(os.path.dirname(__file__), '../mmseg/data', 'words.dic')),
    )

    @staticmethod
    def load_dictionaries():
        """Load every entry of ``Dictionary.dictionaries`` into mmseg.

        'chars' entries go through ``mmseg.load_chars`` and 'words' entries
        through ``mmseg.load_words``; entries of any other kind are skipped.

        Raises:
            IOError: if a loader returns a falsy value for a path.
        """
        for kind, path in Dictionary.dictionaries:
            if kind == 'chars':
                loaded = mmseg.load_chars(path)
            elif kind == 'words':
                loaded = mmseg.load_words(path)
            else:
                # Unknown kinds are ignored.
                continue
            if not loaded:
                raise IOError("Cannot open '%s'" % path)
|
|
|
|
|
|
# Expose Dictionary.load_dictionaries as mmseg.dict_load_defaults; this script
# calls it further down to load the bundled dictionaries.
mmseg.dict_load_defaults = Dictionary.load_dictionaries
|
|
|
|
|
|
class Algorithm(object):
    """\
    Iterable wrapper around ``mmseg.Algorithm``.

    Tokens are pulled one at a time from the underlying object until it
    returns a token whose ``length`` is 0, which marks the end of input.
    """

    def __init__(self, text: str):
        """\
        Create an Algorithm instance to segment text.
        """
        # Set the flag first so that __del__ -> _destroy() always finds it,
        # even when one of the statements below raises.
        self.destroied = False
        # add a reference to prevent the string buffer from
        # being GC-ed
        self.text = text.encode('utf8')
        # NOTE(review): the str `text` (not the encoded `self.text`) is what
        # gets passed to mmseg.Algorithm -- confirm this is what the
        # extension expects.
        self.algor = mmseg.Algorithm(text)

    def __iter__(self):
        """\
        Iterate through all tokens. Note the iteration has
        side-effect: an Algorithm object can only be iterated
        once.
        """
        while True:
            tk = self.next_token()
            if tk is None:
                # End the generator with a plain return. Raising
                # StopIteration inside a generator is converted into
                # RuntimeError on Python 3.7+ (PEP 479).
                return
            yield tk

    def next_token(self):
        """\
        Get next token. When no token available, return None.
        """
        if self.destroied:
            return None

        tk = self.algor.next_token()
        if tk.length == 0:
            # no token available, the algorithm object
            # can be destroyed
            self._destroy()
            return None
        return tk

    def _destroy(self):
        """Mark this object as exhausted; further next_token() calls return None."""
        if not self.destroied:
            self.destroied = True

    def __del__(self):
        self._destroy()
|
|
|
|
|
|
def profile(fn):
    """Decorator: run *fn* under cProfile and dump a report to stderr.

    After every call (including calls that raise) the wrapper prints three
    sections to ``sys.stderr``, sorted by internal time: overall stats,
    callers and callees. The wrapped function's return value or exception
    is passed through unchanged.

    Args:
        fn: the callable to profile.

    Returns:
        A wrapper with the same call signature and metadata as *fn*.
    """
    # Local import so this block does not depend on a new file-level import.
    import functools

    @functools.wraps(fn)  # keep fn's __name__/__doc__ on the wrapper
    def wrapper(*args, **kwargs):
        profiler = cProfile.Profile()
        profiler.enable()
        try:
            return fn(*args, **kwargs)
        finally:
            profiler.disable()
            # The report is assembled in memory and emitted in a single
            # print; `with` closes the buffer even if report generation fails.
            with StringIO() as stream:
                stats = pstats.Stats(profiler, stream=stream)
                stats.sort_stats('time')

                print("", file=stream)
                print("=" * 100, file=stream)
                print("Stats:", file=stream)
                stats.print_stats()

                print("=" * 100, file=stream)
                print("Callers:", file=stream)
                stats.print_callers()

                print("=" * 100, file=stream)
                print("Callees:", file=stream)
                stats.print_callees()

                print(stream.getvalue(), file=sys.stderr)

    return wrapper
|
|
|
|
|
|
def print_usage():
    """Print the command-line help text to stdout and exit with status 0."""
    usage_text = """
mmseg Segment Chinese text. Read from stdin and print to stdout.

Options:
-h
--help Print this message

-s
--separator Select the separator of the segmented text. Default is space.
"""
    print(usage_text)
    sys.exit(0)
|
|
|
|
|
|
# Default token separator; overridden by -s / --separator.
separator = " "

# Accept the long forms (--help, --separator) that the usage text advertises,
# in addition to the short ones. Previously only 'hs:' was passed, so the long
# options were rejected with GetoptError.
optlst, args = getopt.getopt(sys.argv[1:], 'hs:', ['help', 'separator='])

for opt, val in optlst:
    if opt in ('-h', '--help'):
        # print_usage() exits the process.
        print_usage()

    elif opt in ('-s', '--separator'):
        separator = val

# load default dictionaries
mmseg.dict_load_defaults()
|
|
|
|
|
|
def process_tokens(stdin, separator):
    """Segment text and return the tokens joined by *separator*.

    Args:
        stdin: the text to segment (the script passes the full contents of
            standard input).
        separator: string inserted between consecutive tokens.

    Returns:
        The ``text`` of every token produced by ``Algorithm``, joined with
        *separator*; an empty string when there are no tokens.
    """
    pieces = []
    try:
        for tk in Algorithm(stdin):
            pieces.append(tk.text)
    except RuntimeError:
        # A RuntimeError during iteration (e.g. a generator that ends by
        # raising StopIteration, see PEP 479) is treated as end of input;
        # the tokens gathered so far are kept.
        pass
    # One join instead of repeated `+=` with a first-item flag.
    return separator.join(pieces)
|
|
|
|
|
|
# Segment everything read from stdin and write it to stdout, followed by a
# newline.
sys.stdout.write(process_tokens(sys.stdin.read(), separator) + '\n')
|