|
|
|
@ -0,0 +1,116 @@
|
|
|
|
|
package com.mashibing.strategy.utils;
|
|
|
|
|
|
|
|
|
|
import com.mashibing.common.constant.CacheConstant;
|
|
|
|
|
import com.mashibing.strategy.feignclient.CacheClient;
|
|
|
|
|
|
|
|
|
|
import java.util.HashMap;
|
|
|
|
|
import java.util.HashSet;
|
|
|
|
|
import java.util.Map;
|
|
|
|
|
import java.util.Set;
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @author heqijun
|
|
|
|
|
* @ClassName: DirtyWordTree
|
|
|
|
|
* @Description: 敏感词树工具类
|
|
|
|
|
* @date 2025/6/8 21:23
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
public class DirtyWordTree {
|
|
|
|
|
|
|
|
|
|
private static final Map DIRTY_WORD_TREE = new HashMap<>();
|
|
|
|
|
|
|
|
|
|
static {
|
|
|
|
|
//通过SpringUtil,获取Spring中的CacheClient对象
|
|
|
|
|
CacheClient cacheClient = (CacheClient) StringUtil.getBeanByClass(CacheClient.class);
|
|
|
|
|
//调用缓存模块接口获取全部敏感词
|
|
|
|
|
Set<String> dirtyWords = cacheClient.smember(CacheConstant.DIRTY_WORD);
|
|
|
|
|
//构建敏感词树
|
|
|
|
|
buildTree(dirtyWords);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* 构建敏感词树
|
|
|
|
|
* 基于dfa算法实现敏感词树
|
|
|
|
|
* 原理:整体结构是一个map,每个词的第一个字都是key,每个字key的value都是一个map
|
|
|
|
|
* 字对应的map里存【以当前字结尾是否是敏感词】和下一个字key和value的map
|
|
|
|
|
*
|
|
|
|
|
* @param dirtyWordsSet 敏感词列表
|
|
|
|
|
*/
|
|
|
|
|
private static void buildTree(Set<String> dirtyWordsSet) {
|
|
|
|
|
Map<String, Map> currentMap;
|
|
|
|
|
//遍历每个词
|
|
|
|
|
for (String dirtyWord : dirtyWordsSet) {
|
|
|
|
|
//外层循环中每次都要指定当前map为最外层map
|
|
|
|
|
currentMap = DIRTY_WORD_TREE;
|
|
|
|
|
for (int i = 0; i < dirtyWord.length(); i++) {
|
|
|
|
|
//获取词中的每个字
|
|
|
|
|
String singleWord = String.valueOf(dirtyWord.charAt(i));
|
|
|
|
|
//如果当前字不在当前层的map中,则添加进去
|
|
|
|
|
if (!currentMap.containsKey(singleWord)) {
|
|
|
|
|
currentMap.put(singleWord, new HashMap());
|
|
|
|
|
}
|
|
|
|
|
//当前字对应的map
|
|
|
|
|
Map currentWordMap = currentMap.get(singleWord);
|
|
|
|
|
//如果当前字的map中妹有isEnd,说明这是刚添进去的字,要指定isEnd
|
|
|
|
|
if (!currentWordMap.containsKey("isEnd") && i < dirtyWord.length() - 1) {
|
|
|
|
|
//如果没到当前词末尾,说明不是敏感词,指定为false
|
|
|
|
|
currentWordMap.put("isEnd", false);
|
|
|
|
|
} else if (i == dirtyWord.length() - 1) {
|
|
|
|
|
//如果到了当前词末尾,说明是敏感词,指定为true
|
|
|
|
|
currentWordMap.put("isEnd", true);
|
|
|
|
|
}
|
|
|
|
|
//每个词的循环中,下一个字要进到下一层map中
|
|
|
|
|
currentMap = currentWordMap;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* 匹配敏感词并返回结果
|
|
|
|
|
*
|
|
|
|
|
* @param text 要匹配的文本
|
|
|
|
|
* @return 匹配结果
|
|
|
|
|
*/
|
|
|
|
|
public static Set<String> getDirtyWord(String text) {
|
|
|
|
|
|
|
|
|
|
//拿到敏感词树
|
|
|
|
|
Map currentMap;
|
|
|
|
|
Set<String> result = new HashSet<>();
|
|
|
|
|
|
|
|
|
|
//遍历文本
|
|
|
|
|
for (int i = 0; i < text.length(); i++) {
|
|
|
|
|
currentMap = DIRTY_WORD_TREE;
|
|
|
|
|
//记录匹配上的敏感字的长度
|
|
|
|
|
int dirtyLength = 0;
|
|
|
|
|
boolean isDirty = false;
|
|
|
|
|
for (int j = i; j < text.length(); j++) {
|
|
|
|
|
//当前字
|
|
|
|
|
String currentWord = String.valueOf(text.charAt(j));
|
|
|
|
|
currentMap = (Map) currentMap.get(currentWord);
|
|
|
|
|
if (currentMap == null) {
|
|
|
|
|
//没匹配上,直接break
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
//匹配上了
|
|
|
|
|
dirtyLength++;
|
|
|
|
|
//判断是否是敏感词,不是的话,继续下一个字
|
|
|
|
|
if ((Boolean) currentMap.get("isEnd")) {
|
|
|
|
|
//是敏感词,退出循环,记录匹配到的敏感词
|
|
|
|
|
isDirty = true;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (isDirty) {
|
|
|
|
|
//记录匹配到的敏感词
|
|
|
|
|
result.add(text.substring(i, i + dirtyLength));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return result;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private DirtyWordTree() {
|
|
|
|
|
}
|
|
|
|
|
}
|