parent
366685379a
commit
59400b2e3e
@ -0,0 +1,34 @@
|
||||
package com.xjs.weixin.consts;
|
||||
|
||||
/**
 * Constants used by the WeChat article crawler.
 *
 * <p>This class only holds constants and cannot be instantiated.
 *
 * @author xiejs
 * @since 2022-03-17
 */
public final class WeiXinConst {

    /**
     * Default directory on disk, used when no other location is configured.
     */
    public static final String PATH = "D:\\Dev\\WebCrawler\\Wechat";

    /**
     * Key of the storage path entry in the system configuration table.
     */
    public static final String CONFIG_KEY = "xjs.webmagic.wechatPicture";

    /**
     * Redis key of the storage path entry: {@link #CONFIG_KEY} prefixed with {@code sys_config:}.
     */
    public static final String REDIS_KEY = "sys_config:" + CONFIG_KEY;

    /**
     * File extension / URL marker for JPEG images (long form).
     */
    public static final String JPEG = "jpeg";

    /**
     * File extension / URL marker for JPEG images (short form).
     */
    public static final String JPG = "jpg";

    /**
     * File extension / URL marker for PNG images.
     */
    public static final String PNG = "png";

    /**
     * Separator placed between a file name and its extension.
     */
    public static final String DOT = ".";

    /** Not instantiable: constants holder only. */
    private WeiXinConst() {
        throw new AssertionError("WeiXinConst is a constants holder and must not be instantiated");
    }
}
|
@ -0,0 +1,37 @@
|
||||
package com.xjs.weixin.controller;
|
||||
|
||||
import com.ruoyi.common.core.web.controller.BaseController;
|
||||
import com.ruoyi.common.core.web.domain.AjaxResult;
|
||||
import com.ruoyi.common.security.annotation.RequiresPermissions;
|
||||
import com.xjs.weixin.service.WeiXinLinkService;
|
||||
import io.swagger.annotations.Api;
|
||||
import io.swagger.annotations.ApiOperation;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.web.bind.annotation.GetMapping;
|
||||
import org.springframework.web.bind.annotation.RequestMapping;
|
||||
import org.springframework.web.bind.annotation.RequestParam;
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
|
||||
/**
|
||||
* 微信文章链接控制器
|
||||
*
|
||||
* @author xiejs
|
||||
* @since 2022-03-17
|
||||
*/
|
||||
@RestController
|
||||
@RequestMapping("weixin_link")
|
||||
@Api(tags = "爬虫模块-微信链接")
|
||||
public class WeiXinLinkController extends BaseController {
|
||||
|
||||
@Autowired
|
||||
private WeiXinLinkService weiXInLinkService;
|
||||
|
||||
@RequiresPermissions("webmagic:weixinlink:get")
|
||||
@GetMapping("/getPicture")
|
||||
@ApiOperation("获取文章图片")
|
||||
public AjaxResult getPicture(@RequestParam("link") String link) {
|
||||
Boolean flag = weiXInLinkService.getPicture(link);
|
||||
return toAjax(flag);
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,16 @@
|
||||
package com.xjs.weixin.service;
|
||||
|
||||
/**
 * Service contract for crawling WeChat article links.
 *
 * @author xiejs
 * @since 2022-03-17
 */
public interface WeiXinLinkService {

    /**
     * Crawls the pictures of a WeChat article.
     *
     * @param link address of the article
     * @return boolean flag describing the outcome of the crawl
     */
    Boolean getPicture(String link);

}
|
@ -0,0 +1,25 @@
|
||||
package com.xjs.weixin.service.impl;
|
||||
|
||||
import com.xjs.weixin.service.WeiXinLinkService;
|
||||
import com.xjs.weixin.task.WeiXinLinkTask;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
/**
|
||||
* 微信文章链接service接口实现
|
||||
*
|
||||
* @author xiejs
|
||||
* @since 2022-03-17
|
||||
*/
|
||||
@Service
|
||||
public class WeiXinLinkServiceImpl implements WeiXinLinkService {
|
||||
|
||||
@Autowired
|
||||
private WeiXinLinkTask weiXinLinkTask;
|
||||
|
||||
@Override
|
||||
public Boolean getPicture(String link) {
|
||||
Long count = weiXinLinkTask.reptileWeiXinLink(link);
|
||||
return count != 0L;
|
||||
}
|
||||
}
|
@ -0,0 +1,52 @@
|
||||
package com.xjs.weixin.task;
|
||||
|
||||
import com.ruoyi.common.redis.service.RedisService;
|
||||
import com.xjs.annotation.ReptileLog;
|
||||
import com.xjs.weixin.webmagic.WeiXinLinkPipeline;
|
||||
import com.xjs.weixin.webmagic.WeiXinLinkProcessor;
|
||||
import lombok.extern.log4j.Log4j2;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.stereotype.Component;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
|
||||
import us.codecraft.webmagic.scheduler.QueueScheduler;
|
||||
|
||||
import static com.xjs.consts.RedisConst.REPTILE_WEIXIN_LINK_COUNT;
|
||||
|
||||
/**
|
||||
* 微信链接爬虫任务
|
||||
* @author xiejs
|
||||
* @since 2022-03-17
|
||||
*/
|
||||
@Component
|
||||
@Log4j2
|
||||
public class WeiXinLinkTask {
|
||||
|
||||
@Autowired
|
||||
private WeiXinLinkProcessor weiXinLinkProcessor;
|
||||
@Autowired
|
||||
private WeiXinLinkPipeline weiXinLinkPipeline;
|
||||
@Autowired
|
||||
private RedisService redisService;
|
||||
|
||||
@ReptileLog(name = "微信链接", url = "###")
|
||||
public Long reptileWeiXinLink(String link) {
|
||||
//执行爬虫
|
||||
Spider.create(weiXinLinkProcessor)
|
||||
.addUrl(link)//设置爬取地址
|
||||
.thread(30)//设置爬取线程数
|
||||
.setScheduler(new QueueScheduler()
|
||||
.setDuplicateRemover(new BloomFilterDuplicateRemover(110000)))//设置url去重过滤器
|
||||
.addPipeline(weiXinLinkPipeline)//设置爬取之后的数据操作
|
||||
//.setDownloader(downloader)//设置下载器
|
||||
.run();//执行
|
||||
|
||||
Integer cache = redisService.getCacheObject(REPTILE_WEIXIN_LINK_COUNT);
|
||||
redisService.deleteObject(REPTILE_WEIXIN_LINK_COUNT);
|
||||
if (cache != null) {
|
||||
return Long.valueOf(cache);
|
||||
}
|
||||
return 0L;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,168 @@
|
||||
package com.xjs.weixin.webmagic;
|
||||
|
||||
import cn.hutool.core.date.DatePattern;
|
||||
import cn.hutool.core.date.DateUtil;
|
||||
import com.ruoyi.common.core.constant.HttpStatus;
|
||||
import com.ruoyi.common.core.utils.StringUtils;
|
||||
import com.ruoyi.common.redis.service.RedisService;
|
||||
import com.ruoyi.system.api.RemoteConfigService;
|
||||
import com.xjs.weixin.consts.WeiXinConst;
|
||||
import lombok.extern.log4j.Log4j2;
|
||||
import org.apache.http.HttpResponse;
|
||||
import org.apache.http.client.methods.HttpGet;
|
||||
import org.apache.http.impl.client.CloseableHttpClient;
|
||||
import org.apache.http.impl.client.HttpClients;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.stereotype.Component;
|
||||
import us.codecraft.webmagic.ResultItems;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.pipeline.Pipeline;
|
||||
|
||||
import javax.annotation.Resource;
|
||||
import java.io.*;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
|
||||
import static com.xjs.weixin.consts.WeiXinConst.*;
|
||||
|
||||
|
||||
/**
|
||||
* 微信链接爬虫数据处理
|
||||
*
|
||||
* @author xiejs
|
||||
* @since 2022-03-17
|
||||
*/
|
||||
@Component
|
||||
@Log4j2
|
||||
public class WeiXinLinkPipeline implements Pipeline {
|
||||
|
||||
@Autowired
|
||||
private RedisService redisService;
|
||||
@Resource
|
||||
private RemoteConfigService remoteConfigService;
|
||||
|
||||
|
||||
@Override
|
||||
public void process(ResultItems resultItems, Task task) {
|
||||
|
||||
|
||||
List<String> linkList = resultItems.get("linkList");
|
||||
for (String link : linkList) {
|
||||
|
||||
// 创建GET请求
|
||||
CloseableHttpClient httpClient = HttpClients.createDefault();
|
||||
HttpGet httpGet = null;
|
||||
InputStream inputStream = null;
|
||||
try {
|
||||
httpGet = new HttpGet(link);
|
||||
HttpResponse response = httpClient.execute(httpGet);
|
||||
if (response.getStatusLine().getStatusCode() == HttpStatus.SUCCESS) {
|
||||
inputStream = response.getEntity().getContent();
|
||||
|
||||
String suffix;
|
||||
if (link.contains(JPEG)) {
|
||||
suffix = JPEG;
|
||||
} else if (link.contains(JPG)) {
|
||||
suffix = JPG;
|
||||
} else if (link.contains(PNG)) {
|
||||
suffix = PNG;
|
||||
} else {
|
||||
suffix = JPG;
|
||||
}
|
||||
|
||||
String fileName = UUID.randomUUID() + DOT + suffix;
|
||||
|
||||
this.downloadPicture(inputStream, getPath(), fileName);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
finally {
|
||||
try {
|
||||
if (httpGet != null) {
|
||||
httpGet.clone();
|
||||
}
|
||||
} catch (CloneNotSupportedException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
try {
|
||||
httpClient.close();
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
try {
|
||||
if (inputStream != null) {
|
||||
inputStream.close();
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
//链接url下载图片
|
||||
private void downloadPicture(InputStream inputStream, String path, String fileName) {
|
||||
try {
|
||||
DataInputStream dataInputStream = new DataInputStream(inputStream);
|
||||
|
||||
//拼接文件路径
|
||||
String newPath=path+ File.separator+DateUtil.format(new Date(), DatePattern.NORM_MONTH_PATTERN)+File.separator
|
||||
+DateUtil.format(new Date(), "dd")+"日";
|
||||
|
||||
//如果文件夹不存在则创建
|
||||
File file = new File(newPath);
|
||||
if (!file.exists()) {
|
||||
file.mkdirs();
|
||||
}
|
||||
|
||||
String absolutePath = file.getAbsolutePath();
|
||||
String absolute = absolutePath + File.separator + fileName;
|
||||
|
||||
FileOutputStream f = new FileOutputStream(absolute);
|
||||
ByteArrayOutputStream out = new ByteArrayOutputStream();
|
||||
|
||||
byte[] buffer = new byte[1024];
|
||||
int length;
|
||||
|
||||
while ((length = dataInputStream.read(buffer)) > 0) {
|
||||
out.write(buffer, 0, length);
|
||||
}
|
||||
|
||||
f.write(out.toByteArray());
|
||||
dataInputStream.close();
|
||||
f.close();
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 从缓存 -> 数据库 -> 内存 中获取磁盘地址
|
||||
*
|
||||
* @return 地址
|
||||
*/
|
||||
private String getPath() {
|
||||
//磁盘路径
|
||||
String path;
|
||||
//判断redis中是否存在
|
||||
Boolean hasKey = redisService.hasKey(REDIS_KEY);
|
||||
if (hasKey) {
|
||||
path = redisService.getCacheObject(REDIS_KEY);
|
||||
} else {
|
||||
String data = remoteConfigService.getConfigKeyForRPC(CONFIG_KEY).getData();
|
||||
if (StringUtils.isNotEmpty(data)) {
|
||||
path = data;
|
||||
} else {
|
||||
path = WeiXinConst.PATH;
|
||||
}
|
||||
}
|
||||
return path;
|
||||
}
|
||||
|
||||
|
||||
}
|
@ -0,0 +1,64 @@
|
||||
package com.xjs.weixin.webmagic;
|
||||
|
||||
import com.ruoyi.common.redis.service.RedisService;
|
||||
import lombok.extern.log4j.Log4j2;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.stereotype.Component;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import static com.xjs.consts.RedisConst.REPTILE_WEIXIN_LINK_COUNT;
|
||||
|
||||
/**
|
||||
* 微信链接爬虫
|
||||
*
|
||||
* @author xiejs
|
||||
* @since 2022-03-17
|
||||
*/
|
||||
@Component
|
||||
@Log4j2
|
||||
public class WeiXinLinkProcessor implements PageProcessor {
|
||||
|
||||
@Autowired
|
||||
private RedisService redisService;
|
||||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
try {
|
||||
Integer count = redisService.getCacheObject(REPTILE_WEIXIN_LINK_COUNT);
|
||||
if (count == null) {
|
||||
count = 0;
|
||||
}
|
||||
|
||||
List<String> linkList = page.getHtml().css("section > section > img", "data-src").all();
|
||||
|
||||
page.putField("linkList",linkList);
|
||||
|
||||
log.info("linkList----{}",linkList);
|
||||
|
||||
count= linkList.size();
|
||||
|
||||
redisService.setCacheObject(REPTILE_WEIXIN_LINK_COUNT, count );
|
||||
} catch (Exception e) {
|
||||
log.error(e.getMessage());
|
||||
} finally {
|
||||
redisService.expire(REPTILE_WEIXIN_LINK_COUNT, 3, TimeUnit.HOURS);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return Site.me()
|
||||
//.addHeader(headerKey, headerValue)
|
||||
.setCharset("utf8")//设置字符编码
|
||||
.setTimeOut(2000)//设置超时时间
|
||||
.setRetrySleepTime(100)//设置重试间隔时间
|
||||
.setCycleRetryTimes(10)//设置重试次数
|
||||
.setSleepTime(1)//设置两个页面之间的间隔时间
|
||||
;
|
||||
}
|
||||
}
|
Loading…
Reference in new issue