@@ -1,15 +1,16 @@
 #!/usr/bin/env python3
 
 # modify from https://sites.google.com/site/homepageoffuyanwei/Home/remarksandexcellentdiscussion/page-2
 
+
 class Word:
-    def __init__(self,text = '',freq = 0):
+    def __init__(self, text='', freq=0):
         self.text = text
         self.freq = freq
         self.length = len(text)
 
 
 class Chunk:
-    def __init__(self,w1,w2 = None,w3 = None):
+    def __init__(self, w1, w2=None, w3=None):
         self.words = []
         self.words.append(w1)
         if w2:
@@ -44,8 +45,8 @@ class Chunk:
             sum += word.freq
         return sum
-
-class ComplexCompare:
+
+class ComplexCompare:
     def takeHightest(self, chunks, comparator):
         i = 1
         for j in range(1, len(chunks)):
@@ -59,23 +60,27 @@ class ComplexCompare:
 
     # the four functions below implement mmseg's four filtering rules, the core of the algorithm
     def mmFilter(self, chunks):
-        def comparator(a,b):
+        def comparator(a, b):
             return a.totalWordLength() - b.totalWordLength()
 
         return self.takeHightest(chunks, comparator)
 
-    def lawlFilter(self,chunks):
-        def comparator(a,b):
+    def lawlFilter(self, chunks):
+        def comparator(a, b):
             return a.averageWordLength() - b.averageWordLength()
-        return self.takeHightest(chunks,comparator)
-
-    def svmlFilter(self,chunks):
-        def comparator(a,b):
+        return self.takeHightest(chunks, comparator)
+
+    def svmlFilter(self, chunks):
+        def comparator(a, b):
             return b.standardDeviation() - a.standardDeviation()
 
         return self.takeHightest(chunks, comparator)
 
-    def logFreqFilter(self,chunks):
-        def comparator(a,b):
+    def logFreqFilter(self, chunks):
+        def comparator(a, b):
             return a.wordFrequency() - b.wordFrequency()
 
         return self.takeHightest(chunks, comparator)
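Note on the hunk above: these four comparators are MMSEG's standard disambiguation rules, applied in turn until a single chunk survives. `mmFilter` keeps the chunks with the greatest total length (maximum matching), `lawlFilter` the largest average word length, `svmlFilter` the smallest spread of word lengths, and `logFreqFilter` the highest word frequency; `takeHightest` passes every chunk tied for the best score on to the next rule. A standalone sketch of the cascade, with a simplified `Chunk` and helper that are illustrative only, not this module's API:

```python
class Chunk:
    def __init__(self, *lengths):
        self.lengths = lengths

    def total(self):
        return sum(self.lengths)

    def average(self):
        return self.total() / len(self.lengths)

    def variance(self):
        avg = self.average()
        return sum((n - avg) ** 2 for n in self.lengths) / len(self.lengths)


def keep_best(chunks, key):
    # mirror takeHightest: keep every chunk tied for the highest score
    best = max(key(c) for c in chunks)
    return [c for c in chunks if key(c) == best]


candidates = [Chunk(1, 1, 2), Chunk(2, 2), Chunk(1, 3)]
rules = [
    lambda c: c.total(),      # rule 1: maximum total word length
    lambda c: c.average(),    # rule 2: largest average word length
    lambda c: -c.variance(),  # rule 3: smallest variance of word lengths
]
for rule in rules:
    if len(candidates) == 1:
        break
    candidates = keep_best(candidates, rule)
print([c.lengths for c in candidates])  # -> [(2, 2)]
```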
@@ -83,6 +88,7 @@ class ComplexCompare:
 dictWord = {}
 maxWordLength = 0
 
+
 def loadDictChars(filepath):
     global maxWordLength
     fsock = open(filepath)
@@ -90,18 +96,22 @@ def loadDictChars(filepath):
         freq, word = line.split()
         word = word.strip()
         dictWord[word] = (len(word), int(freq))
-        maxWordLength = len(word) if maxWordLength < len(word) else maxWordLength
+        maxWordLength = len(word) if maxWordLength < len(
+            word) else maxWordLength
     fsock.close()
 
+
 def loadDictWords(filepath):
     global maxWordLength
     fsock = open(filepath)
     for line in fsock.readlines():
         word = line.strip()
         dictWord[word] = (len(word), 0)
-        maxWordLength = len(word) if maxWordLength < len(word) else maxWordLength
+        maxWordLength = len(word) if maxWordLength < len(
+            word) else maxWordLength
     fsock.close()
 
+
 # check whether the given word is in the dictionary dictWord
 def getDictWord(word):
     result = dictWord.get(word)
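The wrapped `maxWordLength` assignments above are pure line-length (E501) rewrites; behaviour is unchanged. For orientation, the loaders imply the dictionary formats: `chars.dic` holds one `frequency character` pair per line, `words.dic` one word per line with frequency fixed at 0, and `maxWordLength` tracks the longest entry to bound lookups. A small sketch with invented sample entries (the real files ship under `data/` and are not reproduced here):

```python
# Invented sample entries in the format the two loaders above expect.
chars_dic = "7000 的\n500 研\n300 究"  # loadDictChars: "<freq> <char>" per line
words_dic = "研究\n生命\n起源"          # loadDictWords: one word per line

dictWord, maxWordLength = {}, 0
for line in chars_dic.splitlines():
    freq, word = line.split()           # same parse as loadDictChars
    dictWord[word] = (len(word), int(freq))
    maxWordLength = max(maxWordLength, len(word))
for word in words_dic.splitlines():
    dictWord[word.strip()] = (len(word.strip()), 0)
    maxWordLength = max(maxWordLength, len(word.strip()))
print(maxWordLength, dictWord["研究"])  # -> 2 (2, 0)
```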
@@ -109,14 +119,15 @@ def getDictWord(word):
         return Word(word, result[1])
     return None
 
 
 # load the dictionaries
 def run():
     from os.path import join, dirname
     loadDictChars(join(dirname(__file__), 'data', 'chars.dic'))
     loadDictWords(join(dirname(__file__), 'data', 'words.dic'))
 
-class Analysis:
+
+class Analysis:
     def __init__(self, text):
         self.text = text
         self.cacheSize = 3
@@ -134,11 +145,10 @@ class Analysis:
         if not dictWord:
             run()
 
-
     def __iter__(self):
         while True:
             token = self.getNextToken()
-            if token == None:
+            if token is None:
                 raise StopIteration
             yield token
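One caveat the formatting pass leaves untouched: under PEP 479 (the default since Python 3.7), `raise StopIteration` inside a generator no longer ends iteration but escapes as a `RuntimeError`. Since the file targets python3, a plain `return` is the safe way to end the generator; a sketch of the fixed method:

```python
# PEP 479-safe variant of Analysis.__iter__: `return` ends a generator
# cleanly, while `raise StopIteration` becomes RuntimeError on 3.7+.
def __iter__(self):
    while True:
        token = self.getNextToken()
        if token is None:
            return  # instead of: raise StopIteration
        yield token
```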
@@ -146,7 +156,7 @@ class Analysis:
         return self.text[self.pos]
 
     # check whether the character is a Chinese character (excluding Chinese punctuation)
-    def isChineseChar(self,charater):
+    def isChineseChar(self, charater):
         return 0x4e00 <= ord(charater) < 0x9fa6
 
     # check whether the character is ASCII
@@ -163,8 +173,8 @@ class Analysis:
         while self.pos < self.textLength:
             if self.isChineseChar(self.getNextChar()):
                 token = self.getChineseWords()
-            else :
-                token = self.getASCIIWords()+'/'
+            else:
+                token = self.getASCIIWords() + '/'
             if len(token) > 0:
                 return token
         return None
@@ -211,7 +221,7 @@ class Analysis:
             chunks = self.complexCompare.svmlFilter(chunks)
             if len(chunks) > 1:
                 chunks = self.complexCompare.logFreqFilter(chunks)
-        if len(chunks) == 0 :
+        if len(chunks) == 0:
             return ''
 
         # in the end only one segmentation remains
@@ -242,13 +252,13 @@ class Analysis:
                         for word3 in words3:
                             # print(word3.length, word3.text)
                             if word3.length == -1:
-                                chunk = Chunk(word1,word2)
+                                chunk = Chunk(word1, word2)
                                 # print("Ture")
-                            else :
-                                chunk = Chunk(word1,word2,word3)
+                            else:
+                                chunk = Chunk(word1, word2, word3)
                             chunks.append(chunk)
                     elif self.pos == self.textLength:
-                        chunks.append(Chunk(word1,word2))
+                        chunks.append(Chunk(word1, word2))
                     self.pos -= len(word2.text)
             elif self.pos == self.textLength:
                 chunks.append(Chunk(word1))
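The hunk above sits in MMSEG's chunk enumeration: starting at the current position it builds every candidate chunk of up to three dictionary words, restoring `self.pos` after each branch, and only emits a shorter chunk when the text runs out. The same enumeration written recursively, as a standalone sketch (the toy `match_words` stands in for the module's dictionary lookup):

```python
def enumerate_chunks(text, pos, match_words, depth=3):
    # A chunk may hold up to `depth` words; shorter chunks appear only
    # when the end of the text is reached, as in the loops above.
    if depth == 0 or pos == len(text):
        return [[]]
    chunks = []
    for word in match_words(text, pos):
        for rest in enumerate_chunks(text, pos + len(word), match_words, depth - 1):
            chunks.append([word] + rest)
    return chunks


def match_words(text, pos):
    # Toy lookup: every single character is a word, plus the word "ab".
    words = [text[pos]]
    if text[pos:pos + 2] == "ab":
        words.append("ab")
    return words


print(enumerate_chunks("abc", 0, match_words))
# -> [['a', 'b', 'c'], ['ab', 'c']]
```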
@@ -268,7 +278,7 @@ class Analysis:
         words = []
         index = 0
         while self.pos < self.textLength:
-            if index >= maxWordLength :
+            if index >= maxWordLength:
                 break
             if not self.isChineseChar(self.getNextChar()):
                 break
@@ -288,18 +298,18 @@ class Analysis:
                 word.text = 'X'
             words.append(word)
 
-        self.cache[self.cacheIndex] = (self.pos,words)
+        self.cache[self.cacheIndex] = (self.pos, words)
         self.cacheIndex += 1
         if self.cacheIndex >= self.cacheSize:
             self.cacheIndex = 0
         return words
 
 
-if __name__=="__main__":
+if __name__ == "__main__":
 
     def cuttest(text):
         #cut = Analysis(text)
-        tmp=""
+        tmp = ""
         try:
             for word in iter(Analysis(text)):
                 tmp += word
@@ -375,6 +385,8 @@ if __name__=="__main__":
     cuttest(u"好人使用了它就可以解决一些问题")
     cuttest(u"是因为和国家")
     cuttest(u"老年搜索还支持")
-    cuttest(u"干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ")
+    cuttest(
+        u"干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 "
+    )
     cuttest("2022年12月30日是星期几?")
     cuttest("二零二二年十二月三十日是星期几?")
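For completeness, the module is driven exactly as `cuttest` shows: build an `Analysis` over a string and iterate; ASCII runs come back with a trailing `/`. A minimal sketch, assuming the module is importable as `mmseg` (the module name is an assumption), the `data/*.dic` dictionaries are present, and the PEP 479 fix noted earlier is applied:

```python
# Minimal driver mirroring cuttest(); module name `mmseg` is assumed.
from mmseg import Analysis

tmp = ""
for token in Analysis(u"研究生命起源"):
    tmp += token
print(tmp)
```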