实现微信公众号定时爬虫任务

pull/254/head
xjs 3 years ago
parent fcb352b230
commit 51a6773135

@ -18,4 +18,7 @@ public interface RemoteWebmagicWeiXinSouGouFeign {
/**
 * Remote GET call to {@code /weixin_sougou/taskForPRC}.
 * NOTE(review): "PRC" looks like a typo of "RPC", but it is part of the request path,
 * so it has to stay identical to the mapping on the serving side.
 *
 * @return the {@code R} result produced by the remote endpoint (or by the fallback)
 */
@GetMapping("/weixin_sougou/taskForPRC")
R WeiXinSouGouTaskForPRC() ;
/**
 * Remote GET call to {@code /weixin_official_accounts/taskForPRC}, the endpoint that
 * starts the WeChat official-account crawl.
 * NOTE(review): "PRC" looks like a typo of "RPC", but it is part of the request path,
 * so it has to stay identical to the mapping on the serving side.
 *
 * @return the {@code R} result produced by the remote endpoint (or by the fallback)
 */
@GetMapping("/weixin_official_accounts/taskForPRC")
R WeiXinOfficialAccountsTaskForPRC() ;
}

@ -22,6 +22,12 @@ public class RemoteWebmagicWeiXinSouGouFactory implements FallbackFactory<Remote
log.error("微信搜狗 爬虫定时任务 降级------服务可能正在运行");
return R.fail("降级处理------服务可能正在运行");
}
/**
 * Fallback for the official-account crawler call: records the degradation in the
 * error log and hands back a failed result instead of propagating the failure.
 *
 * @return a failed {@code R} carrying the degradation message
 */
@Override
public R WeiXinOfficialAccountsTaskForPRC() {
    final String logLine = "微信公众号 爬虫定时任务 降级------服务可能正在运行";
    log.error(logLine);
    final R degraded = R.fail("降级处理------服务可能正在运行");
    return degraded;
}
};
}
}

@ -0,0 +1,35 @@
package com.xjs.job.task.webmagic;
import com.ruoyi.common.core.domain.R;
import com.xjs.business.webmagic.RemoteWebmagicWeiXinSouGouFeign;
import com.xjs.job.aop.TaskLog;
import lombok.extern.log4j.Log4j2;
import org.springframework.stereotype.Component;
import javax.annotation.Resource;
/**
 * Scheduled-job bean that triggers the WeChat official-account crawler through the
 * remote Feign client and logs the outcome.
 *
 * @author xiejs
 * @since 2022-06-13
 */
@Component("OfficialAccountsTask")
@Log4j2
public class OfficialAccountsTask {

    @Resource
    private RemoteWebmagicWeiXinSouGouFeign remoteWebmagicWeiXinSouGouFeign;

    /**
     * Calls the remote crawler endpoint once and logs the code, message and data of
     * the result it returns, framed by start/end marker lines.
     */
    @TaskLog(name = "微信公众号爬虫任务")
    public void execute() {
        log.info("---------------爬虫-公众号定时任务Start-------------------");

        final R result = remoteWebmagicWeiXinSouGouFeign.WeiXinOfficialAccountsTaskForPRC();
        final int code = result.getCode();
        final String msg = result.getMsg();
        final Object data = result.getData();
        log.info("爬虫-公众号定时任务结果:code={},msg={},data={}", code, msg, data);

        log.info("---------------爬虫-公众号定时任务end---------------------");
    }
}

@ -160,7 +160,7 @@
<el-input v-model="form.configKey" placeholder="请输入参数键名" />
</el-form-item>
<el-form-item label="参数键值" prop="configValue">
<el-input v-model="form.configValue" placeholder="请输入参数键值" />
<el-input type="textarea" :rows="4" v-model="form.configValue" placeholder="请输入参数键值" />
</el-form-item>
<el-form-item label="系统内置" prop="configType">
<el-radio-group v-model="form.configType">

@ -60,6 +60,16 @@ public class RedisConst {
*/
public static final String REPTILE_WEIXIN_LINK_COUNT = "bussiness:reptile:weixin.link.count";
/**
* Redis key under which the WeChat official-account crawler stores its count value
* (weixin.official.count).
* NOTE(review): the "bussiness" spelling is part of the stored key text; it is runtime
* data, so correcting it would change which Redis entry is read and written.
*/
public static final String REPTILE_WEIXIN_OFFICIAL_COUNT = "bussiness:reptile:weixin.official.count";
/**
* Redis key (temp:official_accounts:name) that temporarily holds the name of the
* official account currently being crawled.
*/
public static final String REPTILE_WEIXIN_OFFICIAL_NAME = "temp:official_accounts:name";
/**
*zol.phone
*/

@ -40,6 +40,9 @@ public class ReptileConst {
*/
public static final String WEIXIN_SOUGOU_URL= "https://weixin.sogou.com/";
/**
* Sogou WeChat search URL prefix; the search term is appended directly after
* {@code query=}.
* NOTE(review): "OFFCIAL" in the identifier looks like a typo of "OFFICIAL"; the name
* is imported by other classes, so renaming it needs a coordinated change.
*/
public static final String WEIXIN_OFFCIAL_URL= "https://weixin.sogou.com/weixin?type=1&s_from=input&query=";
/**
* url
*/

@ -0,0 +1,25 @@
package com.xjs.utils;
import cn.hutool.core.util.RandomUtil;
/**
 * Small helpers for generating random values.
 *
 * @author xiejs
 * @since 2022-06-13
 */
public class RandomUtils {

    /** The 26 uppercase Latin letters that {@link #randomZm()} chooses from. */
    private static final String[] zm = {"A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"};

    /**
     * Returns one uppercase letter chosen uniformly at random.
     *
     * <p>The index bound is derived from the array length. The previous hard-coded
     * upper bound of 25 was exclusive, so "Z" could never be produced. The JDK
     * generator is referenced by its fully-qualified name so that no additional
     * import is required.
     *
     * @return a one-character string in the range "A".."Z"
     */
    public static String randomZm() {
        int index = java.util.concurrent.ThreadLocalRandom.current().nextInt(zm.length);
        return zm[index];
    }
}

@ -0,0 +1,89 @@
package com.xjs.common.util;
import java.io.*;
/**
 * Helpers shared by the WeChat crawler pipelines: title sanitising and writing a
 * downloaded image stream to disk.
 *
 * @author xiejs
 * @since 2022-06-13
 */
public class WeiXinUtils {

    /** Substrings that {@link #filterTitle(String)} replaces with "-". */
    private static final String[] REPLACED_WITH_DASH = {"/", "\\", ":", "*", "?", "\"", "<", ">", "|"};

    /**
     * Makes a title usable as a directory name: removes every space and replaces each of
     * {@code / \ : * ? " < > |} with {@code -}.
     *
     * @param title the raw title; must not be {@code null}
     * @return the sanitised title
     */
    public static String filterTitle(String title) {
        String cleaned = title.replace(" ", "");
        for (String token : REPLACED_WITH_DASH) {
            cleaned = cleaned.replace(token, "-");
        }
        return cleaned;
    }

    /**
     * Writes the content of {@code inputStream} to {@code appendPath/fileName}, creating
     * the directory when it does not exist yet.
     *
     * <p>The data is streamed straight to the file instead of being buffered in memory,
     * and both streams are closed on every path, including failures. I/O errors are
     * reported on standard error and otherwise swallowed, so one failed image does not
     * abort the caller.
     *
     * @param inputStream source of the image bytes; always closed by this method
     * @param path        not used by this method; kept for signature compatibility
     * @param fileName    name of the file to create inside {@code appendPath}
     * @param title       not used by this method; kept for signature compatibility
     * @param appendPath  directory the file is written into
     */
    public static void downloadPicture(InputStream inputStream, String path, String fileName, String title, String appendPath) {
        File directory = new File(appendPath);
        if (!directory.exists()) {
            // A failed mkdirs() surfaces as FileNotFoundException when the file is opened below.
            directory.mkdirs();
        }
        File target = new File(directory.getAbsolutePath() + File.separator + fileName);

        try (InputStream in = inputStream;
             OutputStream out = new FileOutputStream(target)) {
            byte[] buffer = new byte[8192];
            int read;
            while ((read = in.read(buffer)) != -1) {
                out.write(buffer, 0, read);
            }
        } catch (IOException e) {
            // Best effort, as before: report and continue. This class has no logger in scope.
            e.printStackTrace();
        }
    }
}

@ -17,11 +17,18 @@ public class WeiXinConst {
*/
public static final String REDIS_KEY = "sys_config:xjs.webmagic.wechatPicture";
/**
* Redis cache key for the official-account setting.
*/
public static final String REDIS_KEY_OFFICIAL = "sys_config:xjs.webmagic.official_accounts";
/**
* Config-service key for the WeChat picture setting (same text as REDIS_KEY without
* the "sys_config:" prefix).
*/
public static final String CONFIG_KEY = "xjs.webmagic.wechatPicture";
/**
* Config-service key for the official-account setting.
* NOTE(review): this key uses ':' separators, whereas REDIS_KEY_OFFICIAL and CONFIG_KEY
* use '.'; confirm which spelling matches the stored configuration entry.
*/
public static final String CONFIG_KEY_OFFICIAL = "xjs:webmagic:official_accounts";
public static final String JPEG = "jpeg";
public static final String JPG = "jpg";

@ -0,0 +1,33 @@
package com.xjs.weixin.controller;
import com.ruoyi.common.core.domain.R;
import com.xjs.weixin.task.OfficialAccountsTask;
import io.swagger.annotations.Api;
import io.swagger.annotations.ApiOperation;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
/**
 * REST controller exposing the WeChat official-account crawler.
 *
 * @author xiejs
 * @since 2022-06-13
 */
@RestController
@RequestMapping("weixin_official_accounts")
@Api(tags = "爬虫模块-微信公众号")
public class OfficialAccountsController {

    @Autowired
    private OfficialAccountsTask officialAccountsTask;

    // ---------------------- endpoints for remote RPC callers ----------------------

    /**
     * Runs the official-account crawl in the calling thread and answers with a
     * success result once it has returned.
     *
     * @return a successful {@code R}
     */
    @GetMapping("taskForPRC")
    @ApiOperation("供定时任务服务RPC远程调用")
    public R WeiXinOfficialAccountsTaskForPRC() {
        officialAccountsTask.execute();
        final R success = R.ok();
        return success;
    }
}

@ -0,0 +1,30 @@
package com.xjs.weixin.controller;
import com.xjs.weixin.task.OfficialAccountsTask;
import io.swagger.annotations.Api;
import io.swagger.annotations.ApiOperation;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
/**
 * Manual trigger for the official-account crawl, mapped to {@code GET /test}.
 *
 * @author xiejs
 * @since 2022-06-13
 */
@RequestMapping("test")
@RestController
@Api(tags = "测试")
public class TestController {

    @Autowired
    private OfficialAccountsTask officialAccountsTask;

    /**
     * Runs the crawl in the calling thread.
     *
     * @return the literal text {@code success} after the crawl has returned
     */
    @GetMapping
    @ApiOperation("微信公众号")
    public String test() {
        officialAccountsTask.execute();
        final String outcome = "success";
        return outcome;
    }
}

@ -0,0 +1,118 @@
package com.xjs.weixin.task;
import com.ruoyi.common.core.constant.HttpStatus;
import com.ruoyi.common.core.domain.R;
import com.ruoyi.common.core.utils.StringUtils;
import com.ruoyi.common.redis.service.RedisService;
import com.ruoyi.system.api.RemoteConfigService;
import com.xjs.annotation.ReptileLog;
import com.xjs.weixin.webmagic.OfficialAccountsPipeline;
import com.xjs.weixin.webmagic.OfficialAccountsProcessor;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import javax.annotation.Resource;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import static com.xjs.consts.RedisConst.REPTILE_WEIXIN_OFFICIAL_COUNT;
import static com.xjs.consts.RedisConst.REPTILE_WEIXIN_OFFICIAL_NAME;
import static com.xjs.consts.ReptileConst.WEIXIN_OFFCIAL_URL;
import static com.xjs.weixin.consts.WeiXinConst.CONFIG_KEY_OFFICIAL;
import static com.xjs.weixin.consts.WeiXinConst.REDIS_KEY_OFFICIAL;
/**
 * Crawls the configured WeChat official accounts through Sogou's WeChat search, one
 * account name at a time.
 *
 * @author xiejs
 * @since 2022-06-13
 */
@Component
@SuppressWarnings("all")
public class OfficialAccountsTask {

    @Autowired
    private OfficialAccountsProcessor officialAccountsProcessor;
    @Autowired
    private RedisService redisService;
    @Autowired
    private OfficialAccountsPipeline officialAccountsPipeline;
    @Resource
    private RemoteConfigService remoteConfigService;

    /**
     * Self reference: calling {@link #reptileWeiXinOfficialAccount(String)} through the
     * injected bean instead of {@code this} lets the AOP proxy apply to the call.
     */
    @Autowired
    private OfficialAccountsTask officialAccountsTask;

    /**
     * Runs one crawl per configured account name. Before each crawl the name is stored
     * in Redis under {@code REPTILE_WEIXIN_OFFICIAL_NAME} so the page processor can
     * read it.
     */
    public void execute() {
        for (String name : this.convert()) {
            redisService.setCacheObject(REPTILE_WEIXIN_OFFICIAL_NAME, name);
            officialAccountsTask.reptileWeiXinOfficialAccount(WEIXIN_OFFCIAL_URL + name);
        }
    }

    /**
     * Runs the spider synchronously for one search URL.
     *
     * @param url the search URL to start from
     * @return the value found under {@code REPTILE_WEIXIN_OFFICIAL_COUNT} after the run
     *         (the key is removed afterwards), or {@code 0} when no value was stored
     */
    @ReptileLog(name = "微信公众号")
    public Long reptileWeiXinOfficialAccount(String url) {
        Spider.create(officialAccountsProcessor)
                .addUrl(url) // start URL
                .thread(30) // number of crawler threads
                .setScheduler(new QueueScheduler()
                        .setDuplicateRemover(new BloomFilterDuplicateRemover(110000))) // URL de-duplication
                .addPipeline(officialAccountsPipeline) // handles the extracted data
                .run(); // blocks until the crawl has finished

        Integer cache = redisService.getCacheObject(REPTILE_WEIXIN_OFFICIAL_COUNT);
        redisService.deleteObject(REPTILE_WEIXIN_OFFICIAL_COUNT);
        return cache == null ? 0L : cache.longValue();
    }

    /**
     * Splits the comma-separated configuration value into account names.
     *
     * <p>A missing or empty setting yields an empty list; the previous code threw a
     * {@code NullPointerException} when the setting was {@code null}. Names are trimmed
     * and blank entries dropped, so values entered on several lines or with stray
     * spaces do not produce broken search URLs.
     *
     * @return the account names, possibly empty, never {@code null}
     */
    private List<String> convert() {
        List<String> names = new ArrayList<>();
        String setting = this.getConfigSetting();
        if (!StringUtils.isNotEmpty(setting)) {
            return names;
        }
        for (String piece : setting.split(",")) {
            String name = piece.trim();
            if (!name.isEmpty()) {
                names.add(name);
            }
        }
        return names;
    }

    /**
     * Reads the account-name setting: the Redis entry {@code REDIS_KEY_OFFICIAL} when it
     * exists, otherwise the value returned by the remote config service.
     *
     * @return the raw setting, or {@code null} when neither source provides one
     */
    private String getConfigSetting() {
        if (redisService.hasKey(REDIS_KEY_OFFICIAL)) {
            return redisService.getCacheObject(REDIS_KEY_OFFICIAL);
        }
        R<String> r = remoteConfigService.getConfigKeyForRPC(CONFIG_KEY_OFFICIAL);
        if (r != null && r.getCode() == HttpStatus.SUCCESS) {
            return r.getData();
        }
        return null;
    }
}

@ -0,0 +1,185 @@
package com.xjs.weixin.webmagic;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.date.DatePattern;
import cn.hutool.core.date.DateUtil;
import cn.hutool.core.util.RandomUtil;
import com.ruoyi.common.core.constant.HttpStatus;
import com.ruoyi.common.core.utils.StringUtils;
import com.ruoyi.common.redis.service.RedisService;
import com.ruoyi.system.api.RemoteConfigService;
import com.xjs.common.util.WeiXinUtils;
import com.xjs.utils.RandomUtils;
import com.xjs.weixin.consts.WeiXinConst;
import lombok.extern.log4j.Log4j2;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import javax.annotation.Resource;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.Date;
import java.util.List;
import static com.xjs.weixin.consts.WeiXinConst.*;
/**
*
*
* @author xiejs
* @since 2022-06-13
*/
@Component
@Log4j2
@SuppressWarnings("all")
public class OfficialAccountsPipeline implements Pipeline {
@Autowired
private RedisService redisService;
@Resource
private RemoteConfigService remoteConfigService;
@Override
public void process(ResultItems resultItems, Task task) {
List<String> linkList = resultItems.get("linkList");
String title = resultItems.get("title");
if (CollUtil.isNotEmpty(linkList) && StringUtils.isNotEmpty(title)) {
String appendPath = this.getAppendPath(title);
File file = new File(appendPath);
if (file.exists()) {
return;
}
for (String link : linkList) {
InputStream inputStream = null;
// 创建GET请求
CloseableHttpClient httpClient = HttpClients.createDefault();
HttpGet httpGet = null;
try {
httpGet = new HttpGet(link);
HttpResponse response = httpClient.execute(httpGet);
if (response.getStatusLine().getStatusCode() == HttpStatus.SUCCESS) {
inputStream = response.getEntity().getContent();
//文件小于30kb则不写入
long contentLength = response.getEntity().getContentLength();
long kb = contentLength / 1024;
if (SIZE_KB > kb) {
continue;
}
//拼接文件后缀
String suffix;
if (link.contains(JPEG)) {
suffix = JPEG;
} else if (link.contains(JPG)) {
suffix = JPG;
} else if (link.contains(PNG)) {
suffix = PNG;
} else if (link.contains(GIF)) {
continue;
} else {
suffix = JPG;
}
String chars = "ABCDEFGHIZKLMNOPQRSTUVWXYZ";
char c = chars.charAt((int) (Math.random() * 1));
String fileName = RandomUtils.randomZm() + RandomUtil.randomLong(100000, 1000000) + DOT + suffix;
this.downloadPicture(inputStream, this.getPath(), fileName, title);
}
} catch (Exception e) {
e.printStackTrace();
} finally {
try {
if (httpGet != null) {
httpGet.clone();
}
} catch (CloneNotSupportedException e) {
e.printStackTrace();
}
try {
httpClient.close();
} catch (IOException e) {
e.printStackTrace();
}
try {
if (inputStream != null) {
inputStream.close();
}
} catch (IOException e) {
log.error(e.getMessage());
}
}
}
}
}
/**
* url
*
* @param inputStream
* @param path
* @param fileName
* @param title
*/
private void downloadPicture(InputStream inputStream, String path, String fileName, String title) {
WeiXinUtils.downloadPicture(inputStream, path, fileName, title, this.getAppendPath(title));
}
/**
*
*
* @param title
* @return str
*/
private String getAppendPath(String title) {
title = WeiXinUtils.filterTitle(title);
return this.getPath() + File.separator + DateUtil.format(new Date(),
DatePattern.NORM_MONTH_PATTERN) + File.separator
+ DateUtil.format(new Date(), "dd") + "日" + File.separator + title;
}
/**
* -> ->
*
* @return
*/
private String getPath() {
//磁盘路径
String path;
//判断redis中是否存在
Boolean hasKey = redisService.hasKey(REDIS_KEY);
if (hasKey) {
path = redisService.getCacheObject(REDIS_KEY);
} else {
String data = remoteConfigService.getConfigKeyForRPC(CONFIG_KEY).getData();
if (StringUtils.isNotEmpty(data)) {
path = data;
} else {
path = WeiXinConst.PATH;
}
}
return path;
}
}

@ -0,0 +1,155 @@
package com.xjs.weixin.webmagic;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.date.DateTime;
import cn.hutool.core.date.DateUtil;
import com.ruoyi.common.core.utils.StringUtils;
import com.ruoyi.common.redis.service.RedisService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static com.xjs.consts.RedisConst.REPTILE_WEIXIN_OFFICIAL_COUNT;
import static com.xjs.consts.RedisConst.REPTILE_WEIXIN_OFFICIAL_NAME;
import static com.xjs.consts.ReptileConst.WEIXIN_SOUGOU_URL;
/**
* WebMagic page processor for the WeChat official-account crawl.
* For each page it (1) queues links from search-result list items whose highlighted
* title equals the account name cached in Redis and whose date is today, (2) follows
* pages containing a "window.location.replace(url)" script by reassembling the URL
* from its quoted fragments, and (3) publishes the page's image links and title as
* result fields.
*
* @author xiejs
* @since 2022-06-13
*/
@Component
public class OfficialAccountsProcessor implements PageProcessor {
@Autowired
private RedisService redisService;
/**
* Name of the request header that is set on the site configuration.
*/
private static final String headerKey = "User-Agent";
/**
* Value sent for that header.
*/
private static final String headerValue = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36";
/**
* Processes one downloaded page. Any exception is caught and printed; the expiry of
* both Redis keys is refreshed to 3 hours in every case.
*
* @param page the downloaded page
*/
@Override
public void process(Page page) {
try {
// NOTE(review): this value is only a starting point; it is overwritten with
// linkList.size() further down, so the stored count reflects the last page only.
// Confirm whether accumulation was intended.
Integer count = redisService.getCacheObject(REPTILE_WEIXIN_OFFICIAL_COUNT);
if (count == null) {
count = 0;
}
List<Selectable> lis = page.getHtml().css(".news-box > .news-list2 > li").nodes();
ArrayList<String> urls = new ArrayList<>();
for (Selectable li : lis) {
String href = li.css("dl > dd > a", "href").get();
// The account name is re-read from Redis for every list item.
Object cacheObject = redisService.getCacheObject(REPTILE_WEIXIN_OFFICIAL_NAME);
String str = (String) cacheObject;
if (StringUtils.isNotEmpty(str)) {
String text = li.css(".txt-box > .tit > a > em", "text").get();
String textA = li.css(".txt-box > .tit > a", "text").get();
// Skip items whose title link has text of its own outside the <em> element.
if (StringUtils.isNotEmpty(textA)) {
continue;
}
if (str.equals(text)) {
// only crawl articles dated today
String date = li.css("dl > dd > span > script").get();
// NOTE(review): if the script node is missing, date is null and matcher(null)
// throws; the outer catch swallows it and the rest of the page is skipped.
Pattern pattern = Pattern.compile("'(.*?)'");
Matcher matcher = pattern.matcher(date);
while (matcher.find()) {
// quoted value is parsed as a number and multiplied by 1000 before conversion
// (presumably epoch seconds to milliseconds; confirm against the page markup)
String word = matcher.group(1);
DateTime dateTime = DateUtil.date(Long.parseLong(word) * 1000);
String dateStr = dateTime.toDateStr();
String nowDateStr = DateUtil.formatDate(new Date());
if (dateStr.equals(nowDateStr)) {
urls.add(WEIXIN_SOUGOU_URL + href);
}
}
}
}
// NOTE(review): executed at the end of every iteration that did not `continue`, so
// after the first such item the name key is gone and later items read null.
// Confirm that this is intended.
redisService.deleteObject(REPTILE_WEIXIN_OFFICIAL_NAME);
}
page.addTargetRequests(urls);
String js = page.getHtml().get();
if (js.contains("window.location.replace(url)")) {
// text between the first "{" and the first "}" of the whole page source
String function = js.substring(js.indexOf("{") + 1, js.indexOf("}"));
//System.out.println("function="+function);
// match the content between single quotes and concatenate the pieces
Pattern pattern = Pattern.compile("'(.*?)'");
Matcher matcher = pattern.matcher(function);
StringBuilder stringBuilder = new StringBuilder();
while (matcher.find()) {
String word = matcher.group(1);
stringBuilder.append(word);
}
page.addTargetRequests(Collections.singletonList(stringBuilder.toString()));
}
// collect image URLs from the data-src attribute
List<String> linkList = page.getHtml().css("img", "data-src").all();
// drop blank entries
linkList.removeIf(StringUtils::isBlank);
// read the title
String title = page.getHtml().css("#activity-name", "text").get();
if (StringUtils.isNotEmpty(title)) {
page.putField("title", title);
}
if (CollUtil.isNotEmpty(linkList)) {
page.putField("linkList", linkList);
}
// replaces (does not add to) the value read at the top of the method
count = linkList.size();
redisService.setCacheObject(REPTILE_WEIXIN_OFFICIAL_COUNT, count);
} catch (Exception e) {
e.printStackTrace();
} finally {
// refresh the time-to-live of both keys to 3 hours
redisService.expire(REPTILE_WEIXIN_OFFICIAL_COUNT, 3, TimeUnit.HOURS);
redisService.expire(REPTILE_WEIXIN_OFFICIAL_NAME, 3, TimeUnit.HOURS);
}
}
/**
* Builds the site configuration used for every request of this processor.
*
* @return a new {@code Site} with the header, charset, timeout and retry settings below
*/
@Override
public Site getSite() {
return Site.me()
//.addHeader(headerKey, headerValue)
.addHeader(headerKey, headerValue)
.setCharset("utf8")// character encoding
.setTimeOut(2000)// timeout
.setRetrySleepTime(100)// pause between retries
.setCycleRetryTimes(10)// number of retries
.setSleepTime(1)// pause between two pages
;
}
}

@ -6,6 +6,7 @@ import com.ruoyi.common.core.constant.HttpStatus;
import com.ruoyi.common.core.utils.StringUtils;
import com.ruoyi.common.redis.service.RedisService;
import com.ruoyi.system.api.RemoteConfigService;
import com.xjs.common.util.WeiXinUtils;
import com.xjs.weixin.consts.WeiXinConst;
import lombok.extern.log4j.Log4j2;
import org.apache.http.HttpResponse;
@ -43,9 +44,6 @@ public class WeiXinLinkPipeline implements Pipeline {
private RemoteConfigService remoteConfigService;
@Override
public void process(ResultItems resultItems, Task task) {
@ -134,79 +132,18 @@ public class WeiXinLinkPipeline implements Pipeline {
*/
private void downloadPicture(InputStream inputStream, String path, String fileName, String title) {
try {
DataInputStream dataInputStream = new DataInputStream(inputStream);
//拼接文件路径
String appendPath = this.getAppendPath(title);
//如果文件夹不存在则创建
File file = new File(appendPath);
if (!file.exists()) {
boolean mkdirs = file.mkdirs();
}
String absolutePath = file.getAbsolutePath();
String absolute = absolutePath + File.separator + fileName;
FileOutputStream f = new FileOutputStream(absolute);
ByteArrayOutputStream out = new ByteArrayOutputStream();
byte[] bf = new byte[1024];
int length;
while ((length = dataInputStream.read(bf)) > 0) {
out.write(bf, 0, length);
}
f.write(out.toByteArray());
dataInputStream.close();
f.close();
} catch (IOException e) {
e.printStackTrace();
}
WeiXinUtils.downloadPicture(inputStream, path, fileName, title, this.getAppendPath(title));
}
/**
*
*
* @param title
* @return str
*/
private String getAppendPath(String title) {
//过滤title字段
title = title.replace(" ", "");
//替换\ 防止报错
if (title.contains("/")) {
title = title.replace("/", "-");
}
if (title.contains("\\")) {
title = title.replace("\\", "-");
}
if (title.contains(":")) {
title = title.replace(":", "-");
}
if (title.contains("*")) {
title = title.replace("*", "-");
}
if (title.contains("?")) {
title = title.replace("?", "-");
}
if (title.contains("\"")) {
title = title.replace("\"", "-");
}
if (title.contains("<")) {
title = title.replace("<", "-");
}
if (title.contains(">")) {
title = title.replace(">", "-");
}
if (title.contains("|")) {
title = title.replace("|", "-");
}
title = WeiXinUtils.filterTitle(title);
return this.getPath() + File.separator + DateUtil.format(new Date(),
DatePattern.NORM_MONTH_PATTERN) + File.separator

Loading…
Cancel
Save