parent
fcb352b230
commit
51a6773135
@ -0,0 +1,35 @@
|
||||
package com.xjs.job.task.webmagic;
|
||||
|
||||
import com.ruoyi.common.core.domain.R;
|
||||
import com.xjs.business.webmagic.RemoteWebmagicWeiXinSouGouFeign;
|
||||
import com.xjs.job.aop.TaskLog;
|
||||
import lombok.extern.log4j.Log4j2;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import javax.annotation.Resource;
|
||||
|
||||
/**
|
||||
* 微信公众号定时任务
|
||||
* @author xiejs
|
||||
* @since 2022-06-13
|
||||
*/
|
||||
@Component("OfficialAccountsTask")
|
||||
@Log4j2
|
||||
public class OfficialAccountsTask {
|
||||
@Resource
|
||||
private RemoteWebmagicWeiXinSouGouFeign remoteWebmagicWeiXinSouGouFeign;
|
||||
|
||||
/**
|
||||
* 爬虫 公众号 定时任务执行
|
||||
*/
|
||||
@TaskLog(name = "微信公众号爬虫任务")
|
||||
public void execute() {
|
||||
log.info("---------------爬虫-公众号定时任务Start-------------------");
|
||||
|
||||
R r = remoteWebmagicWeiXinSouGouFeign.WeiXinOfficialAccountsTaskForPRC();
|
||||
|
||||
log.info("爬虫-公众号定时任务结果:code={},msg={},data={}",r.getCode(),r.getMsg(),r.getData());
|
||||
log.info("---------------爬虫-公众号定时任务end---------------------");
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,25 @@
|
||||
package com.xjs.utils;
|
||||
|
||||
import cn.hutool.core.util.RandomUtil;
import java.util.concurrent.ThreadLocalRandom;
|
||||
|
||||
/**
 * Utility class for generating random values.
 *
 * @author xiejs
 * @since 2022-06-13
 */
public class RandomUtils {

    /** Candidate letters: the 26 uppercase ASCII letters, A through Z. */
    private static final String[] zm = {"A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
            "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"};

    /**
     * Returns a random uppercase letter.
     *
     * @return a one-character string chosen uniformly from "A" to "Z", both inclusive
     */
    public static String randomZm() {
        // The bound of nextInt is exclusive, so using the array length covers every
        // index including the last one ("Z"); a hard-coded upper bound of 25 would
        // never produce it.
        int i = ThreadLocalRandom.current().nextInt(zm.length);
        return zm[i];
    }
}
|
@ -0,0 +1,33 @@
|
||||
package com.xjs.weixin.controller;
|
||||
|
||||
import com.ruoyi.common.core.domain.R;
|
||||
import com.xjs.weixin.task.OfficialAccountsTask;
|
||||
import io.swagger.annotations.Api;
|
||||
import io.swagger.annotations.ApiOperation;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.web.bind.annotation.GetMapping;
|
||||
import org.springframework.web.bind.annotation.RequestMapping;
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
|
||||
/**
|
||||
* 微信公众号controller
|
||||
* @author xiejs
|
||||
* @since 2022-06-13
|
||||
*/
|
||||
@RestController
|
||||
@RequestMapping("weixin_official_accounts")
|
||||
@Api(tags = "爬虫模块-微信公众号")
|
||||
public class OfficialAccountsController {
|
||||
|
||||
@Autowired
|
||||
private OfficialAccountsTask officialAccountsTask;
|
||||
|
||||
|
||||
//----------------------远程rpc调用---------------------------
|
||||
@GetMapping("taskForPRC")
|
||||
@ApiOperation("供定时任务服务RPC远程调用")
|
||||
public R WeiXinOfficialAccountsTaskForPRC() {
|
||||
officialAccountsTask.execute();
|
||||
return R.ok();
|
||||
}
|
||||
}
|
@ -0,0 +1,30 @@
|
||||
package com.xjs.weixin.controller;
|
||||
|
||||
import com.xjs.weixin.task.OfficialAccountsTask;
|
||||
import io.swagger.annotations.Api;
|
||||
import io.swagger.annotations.ApiOperation;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.web.bind.annotation.GetMapping;
|
||||
import org.springframework.web.bind.annotation.RequestMapping;
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
|
||||
/**
|
||||
* @author xiejs
|
||||
* @since 2022-06-13
|
||||
*/
|
||||
@RequestMapping("test")
|
||||
@RestController
|
||||
@Api(tags = "测试")
|
||||
public class TestController {
|
||||
|
||||
@Autowired
|
||||
private OfficialAccountsTask officialAccountsTask;
|
||||
|
||||
|
||||
@GetMapping
|
||||
@ApiOperation("微信公众号")
|
||||
public String test() {
|
||||
officialAccountsTask.execute();
|
||||
return "success";
|
||||
}
|
||||
}
|
@ -0,0 +1,118 @@
|
||||
package com.xjs.weixin.task;
|
||||
|
||||
import com.ruoyi.common.core.constant.HttpStatus;
|
||||
import com.ruoyi.common.core.domain.R;
|
||||
import com.ruoyi.common.core.utils.StringUtils;
|
||||
import com.ruoyi.common.redis.service.RedisService;
|
||||
import com.ruoyi.system.api.RemoteConfigService;
|
||||
import com.xjs.annotation.ReptileLog;
|
||||
import com.xjs.weixin.webmagic.OfficialAccountsPipeline;
|
||||
import com.xjs.weixin.webmagic.OfficialAccountsProcessor;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.stereotype.Component;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
|
||||
import us.codecraft.webmagic.scheduler.QueueScheduler;
|
||||
|
||||
import javax.annotation.Resource;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import static com.xjs.consts.RedisConst.REPTILE_WEIXIN_OFFICIAL_COUNT;
|
||||
import static com.xjs.consts.RedisConst.REPTILE_WEIXIN_OFFICIAL_NAME;
|
||||
import static com.xjs.consts.ReptileConst.WEIXIN_OFFCIAL_URL;
|
||||
import static com.xjs.weixin.consts.WeiXinConst.CONFIG_KEY_OFFICIAL;
|
||||
import static com.xjs.weixin.consts.WeiXinConst.REDIS_KEY_OFFICIAL;
|
||||
|
||||
/**
|
||||
* 微信公众号定时任务
|
||||
*
|
||||
* @author xiejs
|
||||
* @since 2022-06-13
|
||||
*/
|
||||
@Component
|
||||
@SuppressWarnings("all")
|
||||
public class OfficialAccountsTask {
|
||||
@Autowired
|
||||
private OfficialAccountsProcessor officialAccountsProcessor;
|
||||
@Autowired
|
||||
private RedisService redisService;
|
||||
@Autowired
|
||||
private OfficialAccountsPipeline officialAccountsPipeline;
|
||||
@Resource
|
||||
private RemoteConfigService remoteConfigService;
|
||||
|
||||
//解决aop自调用不生成代理对象问题
|
||||
@Autowired
|
||||
private OfficialAccountsTask officialAccountsTask;
|
||||
|
||||
public void execute() {
|
||||
|
||||
List<String> names = this.convert();
|
||||
for (String name : names) {
|
||||
String url = WEIXIN_OFFCIAL_URL + name;
|
||||
|
||||
redisService.setCacheObject(REPTILE_WEIXIN_OFFICIAL_NAME,name);
|
||||
|
||||
Long aLong = officialAccountsTask.reptileWeiXinOfficialAccount(url);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ReptileLog(name = "微信公众号")
|
||||
public Long reptileWeiXinOfficialAccount(String url) {
|
||||
//执行爬虫
|
||||
Spider.create(officialAccountsProcessor)
|
||||
.addUrl(url)//设置爬取地址
|
||||
.thread(30)//设置爬取线程数
|
||||
.setScheduler(new QueueScheduler()
|
||||
.setDuplicateRemover(new BloomFilterDuplicateRemover(110000)))//设置url去重过滤器
|
||||
//.setDownloader(downloader)//设置下载器
|
||||
.addPipeline(officialAccountsPipeline)//设置爬取之后的数据操作
|
||||
.run();//同步执行
|
||||
|
||||
Integer cache = redisService.getCacheObject(REPTILE_WEIXIN_OFFICIAL_COUNT);
|
||||
redisService.deleteObject(REPTILE_WEIXIN_OFFICIAL_COUNT);
|
||||
if (cache != null) {
|
||||
return Long.valueOf(cache);
|
||||
}
|
||||
return 0L;
|
||||
}
|
||||
|
||||
private List<String> convert() {
|
||||
String str = this.getConfigSetting();
|
||||
|
||||
if (StringUtils.isNotEmpty(str) && !str.contains(",")) {
|
||||
return Arrays.asList(str);
|
||||
}
|
||||
|
||||
if (str.contains(",")) {
|
||||
String[] split = str.split(",");
|
||||
return Arrays.asList(split);
|
||||
}
|
||||
|
||||
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取系统配置参数
|
||||
*
|
||||
* @return str
|
||||
*/
|
||||
private String getConfigSetting() {
|
||||
if (redisService.hasKey(REDIS_KEY_OFFICIAL)) {
|
||||
return redisService.getCacheObject(REDIS_KEY_OFFICIAL);
|
||||
}
|
||||
|
||||
R<String> r = remoteConfigService.getConfigKeyForRPC(CONFIG_KEY_OFFICIAL);
|
||||
|
||||
if (r.getCode() == HttpStatus.SUCCESS) {
|
||||
return r.getData();
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
}
|
@ -0,0 +1,185 @@
|
||||
package com.xjs.weixin.webmagic;
|
||||
|
||||
import cn.hutool.core.collection.CollUtil;
|
||||
import cn.hutool.core.date.DatePattern;
|
||||
import cn.hutool.core.date.DateUtil;
|
||||
import cn.hutool.core.util.RandomUtil;
|
||||
import com.ruoyi.common.core.constant.HttpStatus;
|
||||
import com.ruoyi.common.core.utils.StringUtils;
|
||||
import com.ruoyi.common.redis.service.RedisService;
|
||||
import com.ruoyi.system.api.RemoteConfigService;
|
||||
import com.xjs.common.util.WeiXinUtils;
|
||||
import com.xjs.utils.RandomUtils;
|
||||
import com.xjs.weixin.consts.WeiXinConst;
|
||||
import lombok.extern.log4j.Log4j2;
|
||||
import org.apache.http.HttpResponse;
|
||||
import org.apache.http.client.methods.HttpGet;
|
||||
import org.apache.http.impl.client.CloseableHttpClient;
|
||||
import org.apache.http.impl.client.HttpClients;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.stereotype.Component;
|
||||
import us.codecraft.webmagic.ResultItems;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.pipeline.Pipeline;
|
||||
|
||||
import javax.annotation.Resource;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
|
||||
import static com.xjs.weixin.consts.WeiXinConst.*;
|
||||
|
||||
/**
|
||||
* 微信公众号爬虫数据处理
|
||||
*
|
||||
* @author xiejs
|
||||
* @since 2022-06-13
|
||||
*/
|
||||
@Component
|
||||
@Log4j2
|
||||
@SuppressWarnings("all")
|
||||
public class OfficialAccountsPipeline implements Pipeline {
|
||||
@Autowired
|
||||
private RedisService redisService;
|
||||
@Resource
|
||||
private RemoteConfigService remoteConfigService;
|
||||
|
||||
|
||||
@Override
|
||||
public void process(ResultItems resultItems, Task task) {
|
||||
|
||||
List<String> linkList = resultItems.get("linkList");
|
||||
String title = resultItems.get("title");
|
||||
|
||||
if (CollUtil.isNotEmpty(linkList) && StringUtils.isNotEmpty(title)) {
|
||||
String appendPath = this.getAppendPath(title);
|
||||
|
||||
File file = new File(appendPath);
|
||||
if (file.exists()) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (String link : linkList) {
|
||||
InputStream inputStream = null;
|
||||
|
||||
// 创建GET请求
|
||||
CloseableHttpClient httpClient = HttpClients.createDefault();
|
||||
HttpGet httpGet = null;
|
||||
try {
|
||||
httpGet = new HttpGet(link);
|
||||
HttpResponse response = httpClient.execute(httpGet);
|
||||
if (response.getStatusLine().getStatusCode() == HttpStatus.SUCCESS) {
|
||||
inputStream = response.getEntity().getContent();
|
||||
|
||||
//文件小于30kb则不写入
|
||||
long contentLength = response.getEntity().getContentLength();
|
||||
long kb = contentLength / 1024;
|
||||
if (SIZE_KB > kb) {
|
||||
continue;
|
||||
}
|
||||
|
||||
//拼接文件后缀
|
||||
String suffix;
|
||||
if (link.contains(JPEG)) {
|
||||
suffix = JPEG;
|
||||
} else if (link.contains(JPG)) {
|
||||
suffix = JPG;
|
||||
} else if (link.contains(PNG)) {
|
||||
suffix = PNG;
|
||||
} else if (link.contains(GIF)) {
|
||||
continue;
|
||||
} else {
|
||||
suffix = JPG;
|
||||
}
|
||||
|
||||
String chars = "ABCDEFGHIZKLMNOPQRSTUVWXYZ";
|
||||
char c = chars.charAt((int) (Math.random() * 1));
|
||||
|
||||
|
||||
String fileName = RandomUtils.randomZm() + RandomUtil.randomLong(100000, 1000000) + DOT + suffix;
|
||||
|
||||
this.downloadPicture(inputStream, this.getPath(), fileName, title);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
} finally {
|
||||
try {
|
||||
if (httpGet != null) {
|
||||
httpGet.clone();
|
||||
}
|
||||
} catch (CloneNotSupportedException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
try {
|
||||
httpClient.close();
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
try {
|
||||
if (inputStream != null) {
|
||||
inputStream.close();
|
||||
}
|
||||
} catch (IOException e) {
|
||||
log.error(e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 链接url下载图片
|
||||
*
|
||||
* @param inputStream 输入流
|
||||
* @param path 磁盘地址
|
||||
* @param fileName 文件名称
|
||||
* @param title 标题名称
|
||||
*/
|
||||
private void downloadPicture(InputStream inputStream, String path, String fileName, String title) {
|
||||
|
||||
WeiXinUtils.downloadPicture(inputStream, path, fileName, title, this.getAppendPath(title));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 获取拼接后的磁盘路径
|
||||
*
|
||||
* @param title 拼接的最后的文件夹
|
||||
* @return str
|
||||
*/
|
||||
private String getAppendPath(String title) {
|
||||
title = WeiXinUtils.filterTitle(title);
|
||||
|
||||
return this.getPath() + File.separator + DateUtil.format(new Date(),
|
||||
DatePattern.NORM_MONTH_PATTERN) + File.separator
|
||||
+ DateUtil.format(new Date(), "dd") + "日" + File.separator + title;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 从缓存 -> 数据库 -> 内存 中获取磁盘地址
|
||||
*
|
||||
* @return 地址
|
||||
*/
|
||||
private String getPath() {
|
||||
//磁盘路径
|
||||
String path;
|
||||
//判断redis中是否存在
|
||||
Boolean hasKey = redisService.hasKey(REDIS_KEY);
|
||||
if (hasKey) {
|
||||
path = redisService.getCacheObject(REDIS_KEY);
|
||||
} else {
|
||||
String data = remoteConfigService.getConfigKeyForRPC(CONFIG_KEY).getData();
|
||||
if (StringUtils.isNotEmpty(data)) {
|
||||
path = data;
|
||||
} else {
|
||||
path = WeiXinConst.PATH;
|
||||
}
|
||||
}
|
||||
return path;
|
||||
}
|
||||
}
|
@ -0,0 +1,155 @@
|
||||
package com.xjs.weixin.webmagic;
|
||||
|
||||
import cn.hutool.core.collection.CollUtil;
|
||||
import cn.hutool.core.date.DateTime;
|
||||
import cn.hutool.core.date.DateUtil;
|
||||
import com.ruoyi.common.core.utils.StringUtils;
|
||||
import com.ruoyi.common.redis.service.RedisService;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.stereotype.Component;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
import us.codecraft.webmagic.selector.Selectable;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import static com.xjs.consts.RedisConst.REPTILE_WEIXIN_OFFICIAL_COUNT;
|
||||
import static com.xjs.consts.RedisConst.REPTILE_WEIXIN_OFFICIAL_NAME;
|
||||
import static com.xjs.consts.ReptileConst.WEIXIN_SOUGOU_URL;
|
||||
|
||||
/**
|
||||
* 微信公众号爬虫
|
||||
*
|
||||
* @author xiejs
|
||||
* @since 2022-06-13
|
||||
*/
|
||||
@Component
|
||||
public class OfficialAccountsProcessor implements PageProcessor {
|
||||
|
||||
@Autowired
|
||||
private RedisService redisService;
|
||||
|
||||
/**
|
||||
* 请求头key
|
||||
*/
|
||||
private static final String headerKey = "User-Agent";
|
||||
/**
|
||||
* 请求头value
|
||||
*/
|
||||
private static final String headerValue = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36";
|
||||
|
||||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
|
||||
try {
|
||||
Integer count = redisService.getCacheObject(REPTILE_WEIXIN_OFFICIAL_COUNT);
|
||||
if (count == null) {
|
||||
count = 0;
|
||||
}
|
||||
|
||||
List<Selectable> lis = page.getHtml().css(".news-box > .news-list2 > li").nodes();
|
||||
ArrayList<String> urls = new ArrayList<>();
|
||||
for (Selectable li : lis) {
|
||||
String href = li.css("dl > dd > a", "href").get();
|
||||
|
||||
Object cacheObject = redisService.getCacheObject(REPTILE_WEIXIN_OFFICIAL_NAME);
|
||||
String str = (String) cacheObject;
|
||||
if (StringUtils.isNotEmpty(str)) {
|
||||
String text = li.css(".txt-box > .tit > a > em", "text").get();
|
||||
String textA = li.css(".txt-box > .tit > a", "text").get();
|
||||
if (StringUtils.isNotEmpty(textA)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (str.equals(text)) {
|
||||
|
||||
//只爬取当天的文章
|
||||
String date = li.css("dl > dd > span > script").get();
|
||||
Pattern pattern = Pattern.compile("'(.*?)'");
|
||||
Matcher matcher = pattern.matcher(date);
|
||||
while (matcher.find()) {
|
||||
//拿到时间戳
|
||||
String word = matcher.group(1);
|
||||
|
||||
DateTime dateTime = DateUtil.date(Long.parseLong(word) * 1000);
|
||||
String dateStr = dateTime.toDateStr();
|
||||
|
||||
String nowDateStr = DateUtil.formatDate(new Date());
|
||||
|
||||
if (dateStr.equals(nowDateStr)) {
|
||||
urls.add(WEIXIN_SOUGOU_URL + href);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
redisService.deleteObject(REPTILE_WEIXIN_OFFICIAL_NAME);
|
||||
|
||||
}
|
||||
|
||||
page.addTargetRequests(urls);
|
||||
|
||||
String js = page.getHtml().get();
|
||||
|
||||
if (js.contains("window.location.replace(url)")) {
|
||||
String function = js.substring(js.indexOf("{") + 1, js.indexOf("}"));
|
||||
//System.out.println("function="+function);
|
||||
|
||||
//正则匹配 ' ' 里面的内容
|
||||
Pattern pattern = Pattern.compile("'(.*?)'");
|
||||
Matcher matcher = pattern.matcher(function);
|
||||
StringBuilder stringBuilder = new StringBuilder();
|
||||
while (matcher.find()) {
|
||||
String word = matcher.group(1);
|
||||
stringBuilder.append(word);
|
||||
}
|
||||
page.addTargetRequests(Collections.singletonList(stringBuilder.toString()));
|
||||
}
|
||||
|
||||
//获取图片url
|
||||
List<String> linkList = page.getHtml().css("img", "data-src").all();
|
||||
//去空
|
||||
linkList.removeIf(StringUtils::isBlank);
|
||||
|
||||
//获取标题
|
||||
String title = page.getHtml().css("#activity-name", "text").get();
|
||||
|
||||
if (StringUtils.isNotEmpty(title)) {
|
||||
page.putField("title", title);
|
||||
}
|
||||
if (CollUtil.isNotEmpty(linkList)) {
|
||||
page.putField("linkList", linkList);
|
||||
}
|
||||
|
||||
count = linkList.size();
|
||||
|
||||
redisService.setCacheObject(REPTILE_WEIXIN_OFFICIAL_COUNT, count);
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
} finally {
|
||||
redisService.expire(REPTILE_WEIXIN_OFFICIAL_COUNT, 3, TimeUnit.HOURS);
|
||||
redisService.expire(REPTILE_WEIXIN_OFFICIAL_NAME, 3, TimeUnit.HOURS);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return Site.me()
|
||||
//.addHeader(headerKey, headerValue)
|
||||
.addHeader(headerKey, headerValue)
|
||||
.setCharset("utf8")//设置字符编码
|
||||
.setTimeOut(2000)//设置超时时间
|
||||
.setRetrySleepTime(100)//设置重试间隔时间
|
||||
.setCycleRetryTimes(10)//设置重试次数
|
||||
.setSleepTime(1)//设置两个页面之间的间隔时间
|
||||
;
|
||||
}
|
||||
}
|
Loading…
Reference in new issue