parent
856166612a
commit
58943979ac
@ -0,0 +1,35 @@
|
||||
package com.xjs.weixin.controller;
|
||||
|
||||
import com.ruoyi.common.core.domain.R;
|
||||
import com.xjs.weixin.task.WeiXinSouGouTask;
|
||||
import io.swagger.annotations.Api;
|
||||
import io.swagger.annotations.ApiOperation;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.web.bind.annotation.GetMapping;
|
||||
import org.springframework.web.bind.annotation.RequestMapping;
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
|
||||
/**
|
||||
* 微信搜狗controller
|
||||
* @author xiejs
|
||||
* @since 2022-02-22
|
||||
*/
|
||||
@RestController
|
||||
@RequestMapping("weixin_sougou")
|
||||
@Api(tags = "爬虫模块-微信搜狗")
|
||||
public class WeiXinSouGouController {
|
||||
|
||||
@Autowired
|
||||
private WeiXinSouGouTask weiXinSouGouTask;
|
||||
|
||||
|
||||
|
||||
//----------------------远程rpc调用---------------------------
|
||||
@GetMapping("taskForPRC")
|
||||
@ApiOperation("供定时任务服务RPC远程调用")
|
||||
public R WeiXinSouGouTaskForPRC() {
|
||||
Long count = weiXinSouGouTask.reptileWeiXinSouGou();
|
||||
return R.ok(count);
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,18 @@
|
||||
package com.xjs.weixin.mapper;
|
||||
|
||||
import com.baomidou.mybatisplus.core.mapper.BaseMapper;
|
||||
import com.xjs.weixin.pojo.WeiXinSouGou;
|
||||
|
||||
/**
|
||||
* 微信搜狗mapper
|
||||
* @author xiejs
|
||||
* @since 2022-02-22
|
||||
*/
|
||||
public interface WeiXinSouGouMapper extends BaseMapper<WeiXinSouGou> {
|
||||
|
||||
/**
|
||||
* 删除重复数据
|
||||
* @return int
|
||||
*/
|
||||
int deleteRepeatData();
|
||||
}
|
@ -0,0 +1,51 @@
|
||||
package com.xjs.weixin.pojo;
|
||||
|
||||
import com.baomidou.mybatisplus.annotation.FieldFill;
|
||||
import com.baomidou.mybatisplus.annotation.TableField;
|
||||
import com.baomidou.mybatisplus.annotation.TableName;
|
||||
import com.ruoyi.common.core.annotation.Excel;
|
||||
import lombok.Data;
|
||||
import lombok.experimental.Accessors;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Date;
|
||||
|
||||
/**
|
||||
* 微信搜狗爬虫数据实体类
|
||||
* @author xiejs
|
||||
* @since 2022-02-22
|
||||
*/
|
||||
@Data
|
||||
@Accessors(chain = true)
|
||||
@TableName("webmagic_weixin_sougou")
|
||||
public class WeiXinSouGou implements Serializable {
|
||||
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
/** 主键id */
|
||||
private Long id;
|
||||
|
||||
/** 文章标题 */
|
||||
@Excel(name = "文章标题")
|
||||
private String title;
|
||||
|
||||
/** 简略的内容 */
|
||||
@Excel(name = "简略的内容")
|
||||
private String content;
|
||||
|
||||
/** 文章来源 */
|
||||
@Excel(name = "文章来源")
|
||||
private String source;
|
||||
|
||||
/** 文章的链接 */
|
||||
@Excel(name = "文章的链接")
|
||||
private String url;
|
||||
|
||||
/** 图片的链接 */
|
||||
@Excel(name = "图片的链接")
|
||||
private String imgUrl;
|
||||
|
||||
@Excel(name = "创建时间",dateFormat = "yyyy-MM-dd HH:mm:ss")
|
||||
@TableField(fill = FieldFill.INSERT)
|
||||
private Date createTime;
|
||||
}
|
@ -0,0 +1,18 @@
|
||||
package com.xjs.weixin.service;
|
||||
|
||||
import com.baomidou.mybatisplus.extension.service.IService;
|
||||
import com.xjs.weixin.pojo.WeiXinSouGou;
|
||||
|
||||
/**
|
||||
* 微信搜狗service接口
|
||||
* @author xiejs
|
||||
* @since 2022-02-22
|
||||
*/
|
||||
public interface WeiXinSouGouService extends IService<WeiXinSouGou> {
|
||||
|
||||
/**
|
||||
* 删除重复数据
|
||||
* @return int
|
||||
*/
|
||||
int deleteRepeatData();
|
||||
}
|
@ -0,0 +1,26 @@
|
||||
package com.xjs.weixin.service.impl;
|
||||
|
||||
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
|
||||
import com.xjs.weixin.mapper.WeiXinSouGouMapper;
|
||||
import com.xjs.weixin.pojo.WeiXinSouGou;
|
||||
import com.xjs.weixin.service.WeiXinSouGouService;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import javax.annotation.Resource;
|
||||
|
||||
/**
|
||||
* 微信搜狗service实现
|
||||
* @author xiejs
|
||||
* @since 2022-02-22
|
||||
*/
|
||||
@Service
|
||||
public class WeiXinSouGouServiceImpl extends ServiceImpl<WeiXinSouGouMapper, WeiXinSouGou> implements WeiXinSouGouService {
|
||||
|
||||
@Resource
|
||||
private WeiXinSouGouMapper weiXinSouGouMapper;
|
||||
|
||||
@Override
|
||||
public int deleteRepeatData() {
|
||||
return weiXinSouGouMapper.deleteRepeatData();
|
||||
}
|
||||
}
|
@ -0,0 +1,37 @@
|
||||
package com.xjs.weixin.task;
|
||||
|
||||
import com.xjs.annotation.ReptileLog;
|
||||
import com.xjs.weixin.service.WeiXinSouGouService;
|
||||
import com.xjs.weixin.webmagic.WeiXinSouGouProcessor;
|
||||
import lombok.extern.log4j.Log4j2;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import static com.xjs.consts.ReptileConst.WEIXIN_SOUGOU_URL;
|
||||
|
||||
/**
|
||||
* 微信搜狗任务
|
||||
* @author xiejs
|
||||
* @since 2022-02-22
|
||||
*/
|
||||
@Component
|
||||
@Log4j2
|
||||
public class WeiXinSouGouTask {
|
||||
|
||||
@Autowired
|
||||
private WeiXinSouGouProcessor weiXinSouGouProcessor;
|
||||
@Autowired
|
||||
private WeiXinSouGouService weiXinSouGouService;
|
||||
|
||||
@ReptileLog(name = "微信搜狗", url = WEIXIN_SOUGOU_URL)
|
||||
public Long reptileWeiXinSouGou() {
|
||||
Long run = weiXinSouGouProcessor.run();
|
||||
|
||||
//删除重复数据
|
||||
int count = weiXinSouGouService.deleteRepeatData();
|
||||
log.info("微信搜狗删除重复数据数:" + count);
|
||||
|
||||
return run;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,37 @@
|
||||
package com.xjs.weixin.webmagic;
|
||||
|
||||
import cn.hutool.core.collection.CollUtil;
|
||||
import com.xjs.weixin.pojo.WeiXinSouGou;
|
||||
import com.xjs.weixin.service.WeiXinSouGouService;
|
||||
import lombok.extern.log4j.Log4j2;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.stereotype.Component;
|
||||
import us.codecraft.webmagic.ResultItems;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.pipeline.Pipeline;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* 微信搜狗网 爬虫数据处理
|
||||
* @author xiejs
|
||||
* @since 2022-02-22
|
||||
*/
|
||||
@Component
|
||||
@Log4j2
|
||||
public class WeiXinSouGouPipeline implements Pipeline {
|
||||
|
||||
@Autowired
|
||||
private WeiXinSouGouService weiXinSouGouService;
|
||||
|
||||
@Override
|
||||
public void process(ResultItems resultItems, Task task) {
|
||||
List<WeiXinSouGou> weiXinSouGouList =resultItems.get("weiXinSouGouList");
|
||||
|
||||
if (CollUtil.isNotEmpty(weiXinSouGouList)) {
|
||||
weiXinSouGouService.saveBatch(weiXinSouGouList, 25);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
@ -0,0 +1,167 @@
|
||||
package com.xjs.weixin.webmagic;
|
||||
|
||||
import com.ruoyi.common.redis.service.RedisService;
|
||||
import com.xjs.weixin.pojo.WeiXinSouGou;
|
||||
import lombok.extern.log4j.Log4j2;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.stereotype.Component;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
|
||||
import us.codecraft.webmagic.scheduler.QueueScheduler;
|
||||
import us.codecraft.webmagic.selector.Selectable;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import static com.xjs.consts.RedisConst.REPTILE_WEIXIN_SOUGOU_COUNT;
|
||||
import static com.xjs.consts.ReptileConst.WEIXIN_SOUGOU_URL;
|
||||
|
||||
/**
|
||||
* 微信搜狗网 爬虫处理
|
||||
*
|
||||
* @author xiejs
|
||||
* @since 2022-02-21
|
||||
*/
|
||||
@Log4j2
|
||||
@Component
|
||||
public class WeiXinSouGouProcessor implements PageProcessor {
|
||||
|
||||
@Autowired
|
||||
private WeiXinSouGouPipeline weiXinSouGouPipeline;
|
||||
|
||||
private static RedisService redisService;
|
||||
|
||||
@Autowired
|
||||
public void setRedisService(RedisService redisService) {
|
||||
WeiXinSouGouProcessor.redisService = redisService;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
try {
|
||||
//复杂度计算
|
||||
//循环次数存入redis中
|
||||
Integer count = redisService.getCacheObject(REPTILE_WEIXIN_SOUGOU_COUNT);
|
||||
if (count == null) {
|
||||
count = 0;
|
||||
}
|
||||
|
||||
List<Selectable> nodes = page.getHtml().css("#type_tab > .fieed-box a").nodes();
|
||||
|
||||
//1、获取需要爬取的路径
|
||||
Set<String> set = new HashSet<>();
|
||||
for (Selectable node : nodes) {
|
||||
count++;
|
||||
|
||||
String s = node.get();
|
||||
|
||||
Document parse = Jsoup.parse(s);
|
||||
|
||||
String id = parse.select("a").attr("id");
|
||||
|
||||
set.add(id);
|
||||
}
|
||||
set.removeIf(s -> !s.contains("pc_"));
|
||||
List<String> list = new ArrayList<>();
|
||||
for (String s : set) {
|
||||
count++;
|
||||
|
||||
s = WEIXIN_SOUGOU_URL + "/pcindex/pc/" + s + "/" + s + ".html";
|
||||
list.add(s);
|
||||
}
|
||||
|
||||
//2、把所有连接加入到队列
|
||||
page.addTargetRequests(list);
|
||||
|
||||
//3、获取需要的参数
|
||||
List<Selectable> newsNodes = page.getHtml().css(".news-list > li").nodes();
|
||||
List<WeiXinSouGou> weiXinSouGouList = new ArrayList<>();
|
||||
for (Selectable newsNode : newsNodes) {
|
||||
count++;
|
||||
|
||||
String s = newsNode.get();
|
||||
|
||||
//文章具体路径
|
||||
String link = newsNode.css(".img-box > a", "href").get();
|
||||
|
||||
//图片路径
|
||||
String imgSrc = newsNode.css(".img-box > a > img", "src").get();
|
||||
|
||||
//标题
|
||||
String title = newsNode.css(".txt-box > h3 > a", "text").get();
|
||||
|
||||
//省略的内容
|
||||
String content = newsNode.css(".txt-box > .txt-info", "text").get();
|
||||
|
||||
//来源
|
||||
String source = newsNode.css(".s-p > a", "text").get();
|
||||
|
||||
WeiXinSouGou weiXinSouGou = new WeiXinSouGou()
|
||||
.setUrl(link)
|
||||
.setImgUrl(imgSrc)
|
||||
.setTitle(title)
|
||||
.setContent(content)
|
||||
.setSource(source);
|
||||
weiXinSouGouList.add(weiXinSouGou);
|
||||
}
|
||||
|
||||
page.putField("weiXinSouGouList",weiXinSouGouList);
|
||||
|
||||
redisService.setCacheObject(REPTILE_WEIXIN_SOUGOU_COUNT, count + 1);
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
} finally {
|
||||
redisService.expire(REPTILE_WEIXIN_SOUGOU_COUNT, 3, TimeUnit.HOURS);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return Site.me()
|
||||
//.addHeader(headerKey, headerValue)
|
||||
.setCharset("utf8")//设置字符编码
|
||||
.setTimeOut(2000)//设置超时时间
|
||||
.setRetrySleepTime(100)//设置重试间隔时间
|
||||
.setCycleRetryTimes(10)//设置重试次数
|
||||
.setSleepTime(1)//设置两个页面之间的间隔时间
|
||||
;
|
||||
}
|
||||
|
||||
/**
|
||||
* 执行爬虫
|
||||
*
|
||||
* @return 返回循环次数
|
||||
*/
|
||||
public Long run() {
|
||||
//执行爬虫
|
||||
Spider.create(new WeiXinSouGouProcessor())
|
||||
.addUrl(WEIXIN_SOUGOU_URL)//设置爬取地址
|
||||
.thread(30)//设置爬取线程数
|
||||
.setScheduler(new QueueScheduler()
|
||||
.setDuplicateRemover(new BloomFilterDuplicateRemover(110000)))//设置url去重过滤器
|
||||
.addPipeline(weiXinSouGouPipeline)//设置爬取之后的数据操作
|
||||
//.setDownloader(downloader)//设置下载器
|
||||
.run();//执行
|
||||
|
||||
//从redis中获取循环次数
|
||||
Integer cache = redisService.getCacheObject(REPTILE_WEIXIN_SOUGOU_COUNT);
|
||||
redisService.deleteObject(REPTILE_WEIXIN_SOUGOU_COUNT);
|
||||
|
||||
if (cache != null) {
|
||||
return Long.valueOf(cache);
|
||||
}
|
||||
|
||||
return 0L;
|
||||
}
|
||||
|
||||
|
||||
}
|
@ -0,0 +1,16 @@
|
||||
<?xml version="1.0" encoding="UTF-8" ?>
|
||||
<!DOCTYPE mapper
|
||||
PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
|
||||
"http://mybatis.org/dtd/mybatis-3-mapper.dtd">
|
||||
<mapper namespace="com.xjs.weixin.mapper.WeiXinSouGouMapper">

    <!--
        Removes duplicated articles: for every url only the row with the smallest id is kept.
        The inner SELECT is wrapped in the derived table "t"
        (presumably because MySQL rejects selecting from the table being deleted from; confirm the target database).
    -->
    <delete id="deleteRepeatData">
        delete from webmagic_weixin_sougou where id not in (
            SELECT
                t.min_id
            FROM
                ( SELECT min( id ) AS min_id FROM webmagic_weixin_sougou GROUP BY url ) AS t
        )
    </delete>
</mapper>
|
@ -0,0 +1,26 @@
|
||||
package com.xjs.weixin.task;
|
||||
|
||||
import com.xjs.XjsWebmagicApp;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.boot.test.context.SpringBootTest;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
/**
|
||||
* @author xiejs
|
||||
* @since 2022-02-22
|
||||
*/
|
||||
@SpringBootTest(classes = XjsWebmagicApp.class)
|
||||
class WeiXinSouGouTaskTest {
|
||||
|
||||
@Autowired
|
||||
WeiXinSouGouTask task;
|
||||
|
||||
@Test
|
||||
void reptileWeiXinSouGou() {
|
||||
|
||||
Long aLong = task.reptileWeiXinSouGou();
|
||||
System.out.println(aLong);
|
||||
}
|
||||
}
|
Loading…
Reference in new issue