1、修改36壁纸提供内部调用rpc的方法名

2、实现微信搜狗爬虫并把数据持久化到数据库
pull/254/head
xjs 4 years ago
parent 856166612a
commit 58943979ac

@ -18,7 +18,7 @@ import org.springframework.web.bind.annotation.GetMapping;
public interface RemoteWebmagic36wallpaperFeign {
@GetMapping("/_36wallpaper/taskForPRC")
public R _36wallpaperControllerTaskForPRC();
public R _36wallpaperTaskForPRC();
}

@ -18,7 +18,7 @@ public class RemoteWebmagic36wallpaperFactory implements FallbackFactory<RemoteW
public RemoteWebmagic36wallpaperFeign create(Throwable cause) {
return new RemoteWebmagic36wallpaperFeign() {
@Override
public R _36wallpaperControllerTaskForPRC() {
public R _36wallpaperTaskForPRC() {
log.error("新浪 爬虫定时任务 降级------服务可能正在运行");
return R.fail("降级处理------服务可能正在运行");
}

@ -30,7 +30,7 @@ public class CopyWritingTask {
*/
public void execute() {
log.info("---------------文案定时任务Start-------------------");
for (int i = 0; i < 6; i++) {
for (int i = 0; i < 8; i++) {
LocalDateTime localDateTime1 = DateUtil.date().toLocalDateTime();
R<CopyWriting> r = remoteCopyWritingFeign.copyWriting();
log.info("文案定时任务[{}]结果:code={},msg={},data={}",i,r.getCode(),r.getMsg(),r.getData());

@ -29,7 +29,7 @@ public class _36wallpaperTask {
log.info("---------------爬虫-36壁纸网定时任务Start-------------------");
LocalDateTime localDateTime1 = DateUtil.date().toLocalDateTime();
R r = remoteWebmagic36wallpaperFeign._36wallpaperControllerTaskForPRC();
R r = remoteWebmagic36wallpaperFeign._36wallpaperTaskForPRC();
log.info("爬虫-36壁纸网定时任务结果:code={},msg={},data={}",r.getCode(),r.getMsg(),r.getData());
LocalDateTime localDateTime2 = DateUtil.date().toLocalDateTime();

@ -47,7 +47,10 @@ public class RedisConst {
/**
*
*/
public static final String REPTILE_COUNT= "reptile:_36wallpaper.count";
public static final String REPTILE_36_WALLPAPER_COUNT= "reptile:_36wallpaper.count";
public static final String REPTILE_WEIXIN_SOUGOU_COUNT= "reptile:weixin.sougou.count";
//-------------------有效时间-----------------------
public static final Integer TRAN_DICT_EXPIRE = 1; //小时

@ -30,9 +30,16 @@ public class ReptileConst {
*/
public static final String BOSS_JOB_URL= "https://www.zhipin.com";
/**
* 36url
*/
public static final String _36_WALLPAPER_URL= "https://www.3gbizhi.com/";
/**
* url
*/
public static final String WEIXIN_SOUGOU_URL= "https://weixin.sogou.com/";

@ -64,7 +64,7 @@ public class _36wallpaperController extends MyBaseController {
//----------------------远程rpc调用---------------------------
@GetMapping("taskForPRC")
@ApiOperation("供定时任务服务RPC远程调用")
public R _36wallpaperControllerTaskForPRC() {
public R _36wallpaperTaskForPRC() {
Long count = wallpaperTask.reptileWallpaper();
return R.ok(count);
}

@ -17,7 +17,7 @@ import javax.annotation.Resource;
import java.util.regex.Pattern;
import static com.xjs._36wallpaper.consts._36wallpaperConst.*;
import static com.xjs.consts.RedisConst.REPTILE_COUNT;
import static com.xjs.consts.RedisConst.REPTILE_36_WALLPAPER_COUNT;
import static com.xjs.consts.RegexConst.FILE_PATH_REGEX;
/**
@ -134,7 +134,7 @@ public class _36wallpaperServiceImpl extends ServiceImpl<_36wallpaperMapper, _36
*/
private void checkRunning() {
//判断爬虫是否正在执行,正在执行不可修改!
if(redisService.hasKey(REPTILE_COUNT)){
if(redisService.hasKey(REPTILE_36_WALLPAPER_COUNT)){
throw new BusinessException("爬虫正在执行中!暂时无法修改,请稍后再试");
}
}

@ -28,7 +28,7 @@ import java.util.concurrent.TimeUnit;
import static com.xjs._36wallpaper.consts._36wallpaperConst.CONFIG_KEY;
import static com.xjs._36wallpaper.consts._36wallpaperConst.REDIS_KEY;
import static com.xjs.consts.RedisConst.REPTILE_COUNT;
import static com.xjs.consts.RedisConst.REPTILE_36_WALLPAPER_COUNT;
import static com.xjs.consts.ReptileConst._36_WALLPAPER_URL;
/**
@ -238,16 +238,16 @@ public class _36wallpaperProcessor implements PageProcessor {
//page.putField("_36wallpaperData",wallpapers);
//循环次数存入redis中
Integer count = redisService.getCacheObject(REPTILE_COUNT);
Integer count = redisService.getCacheObject(REPTILE_36_WALLPAPER_COUNT);
if (count == null) {
count = 0;
}
redisService.setCacheObject(REPTILE_COUNT, count + 1);
redisService.setCacheObject(REPTILE_36_WALLPAPER_COUNT, count + 1);
} catch (Exception e) {
e.printStackTrace();
} finally {
redisService.expire(REPTILE_COUNT, 1, TimeUnit.HOURS);
redisService.expire(REPTILE_36_WALLPAPER_COUNT, 1, TimeUnit.HOURS);
}
}
@ -291,8 +291,8 @@ public class _36wallpaperProcessor implements PageProcessor {
log.info("36壁纸删除重复数据数" + count);
//从redis中获取循环次数
Integer cache = redisService.getCacheObject(REPTILE_COUNT);
redisService.deleteObject(REPTILE_COUNT);
Integer cache = redisService.getCacheObject(REPTILE_36_WALLPAPER_COUNT);
redisService.deleteObject(REPTILE_36_WALLPAPER_COUNT);
if (cache != null) {
return Long.valueOf(cache);

@ -0,0 +1,35 @@
package com.xjs.weixin.controller;
import com.ruoyi.common.core.domain.R;
import com.xjs.weixin.task.WeiXinSouGouTask;
import io.swagger.annotations.Api;
import io.swagger.annotations.ApiOperation;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
/**
 * REST controller for the WeChat-Sogou crawler module, mounted under {@code weixin_sougou}.
 *
 * @author xiejs
 * @since 2022-02-22
 */
@RestController
@RequestMapping("weixin_sougou")
@Api(tags = "爬虫模块-微信搜狗")
public class WeiXinSouGouController {

    @Autowired
    private WeiXinSouGouTask weiXinSouGouTask;

    //---------------------- remote RPC entry points ---------------------------

    /**
     * Runs the crawler task and wraps the value it returns in a success response.
     *
     * @return {@code R.ok(...)} carrying the value returned by
     *         {@code WeiXinSouGouTask.reptileWeiXinSouGou()}
     */
    @GetMapping("taskForPRC")
    @ApiOperation("供定时任务服务RPC远程调用")
    public R WeiXinSouGouTaskForPRC() {
        return R.ok(weiXinSouGouTask.reptileWeiXinSouGou());
    }
}

@ -0,0 +1,18 @@
package com.xjs.weixin.mapper;
import com.baomidou.mybatisplus.core.mapper.BaseMapper;
import com.xjs.weixin.pojo.WeiXinSouGou;
/**
 * MyBatis-Plus mapper for {@link WeiXinSouGou} rows.
 *
 * @author xiejs
 * @since 2022-02-22
 */
public interface WeiXinSouGouMapper extends BaseMapper<WeiXinSouGou> {
/**
 * Deletes duplicate rows. The mapped statement keeps, for each distinct url,
 * the row with the smallest id and deletes every other row.
 *
 * @return int - presumably the number of rows deleted (callers log it as such); TODO confirm
 */
int deleteRepeatData();
}

@ -0,0 +1,51 @@
package com.xjs.weixin.pojo;
import com.baomidou.mybatisplus.annotation.FieldFill;
import com.baomidou.mybatisplus.annotation.TableField;
import com.baomidou.mybatisplus.annotation.TableName;
import com.ruoyi.common.core.annotation.Excel;
import lombok.Data;
import lombok.experimental.Accessors;
import java.io.Serializable;
import java.util.Date;
/**
 * Entity mapped to the {@code webmagic_weixin_sougou} table: one article entry
 * collected by the WeChat-Sogou crawler.
 *
 * @author xiejs
 * @since 2022-02-22
 */
@Data
@Accessors(chain = true)
@TableName("webmagic_weixin_sougou")
public class WeiXinSouGou implements Serializable {
private static final long serialVersionUID = 1L;
/** Primary key id */
private Long id;
/** Article title */
@Excel(name = "文章标题")
private String title;
/** Abbreviated content (summary text) */
@Excel(name = "简略的内容")
private String content;
/** Article source */
@Excel(name = "文章来源")
private String source;
/** Link to the article */
@Excel(name = "文章的链接")
private String url;
/** Link to the image */
@Excel(name = "图片的链接")
private String imgUrl;
/** Creation time; marked with FieldFill.INSERT so it is filled on insert */
@Excel(name = "创建时间",dateFormat = "yyyy-MM-dd HH:mm:ss")
@TableField(fill = FieldFill.INSERT)
private Date createTime;
}

@ -0,0 +1,18 @@
package com.xjs.weixin.service;
import com.baomidou.mybatisplus.extension.service.IService;
import com.xjs.weixin.pojo.WeiXinSouGou;
/**
 * Service interface for {@link WeiXinSouGou} records.
 *
 * @author xiejs
 * @since 2022-02-22
 */
public interface WeiXinSouGouService extends IService<WeiXinSouGou> {
/**
 * Deletes duplicate records.
 *
 * @return int - presumably the number of rows deleted (callers log it as such); TODO confirm
 */
int deleteRepeatData();
}

@ -0,0 +1,26 @@
package com.xjs.weixin.service.impl;
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
import com.xjs.weixin.mapper.WeiXinSouGouMapper;
import com.xjs.weixin.pojo.WeiXinSouGou;
import com.xjs.weixin.service.WeiXinSouGouService;
import org.springframework.stereotype.Service;
import javax.annotation.Resource;
/**
 * Default {@link WeiXinSouGouService} implementation; duplicate removal is
 * delegated to {@link WeiXinSouGouMapper}.
 *
 * @author xiejs
 * @since 2022-02-22
 */
@Service
public class WeiXinSouGouServiceImpl extends ServiceImpl<WeiXinSouGouMapper, WeiXinSouGou> implements WeiXinSouGouService {

    @Resource
    private WeiXinSouGouMapper weiXinSouGouMapper;

    /**
     * Forwards to {@code WeiXinSouGouMapper.deleteRepeatData()}.
     *
     * @return the value returned by the mapper call
     */
    @Override
    public int deleteRepeatData() {
        final int removed = this.weiXinSouGouMapper.deleteRepeatData();
        return removed;
    }
}

@ -0,0 +1,37 @@
package com.xjs.weixin.task;
import com.xjs.annotation.ReptileLog;
import com.xjs.weixin.service.WeiXinSouGouService;
import com.xjs.weixin.webmagic.WeiXinSouGouProcessor;
import lombok.extern.log4j.Log4j2;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import static com.xjs.consts.ReptileConst.WEIXIN_SOUGOU_URL;
/**
 * Task that runs the WeChat-Sogou crawler and then removes duplicate rows.
 *
 * @author xiejs
 * @since 2022-02-22
 */
@Component
@Log4j2
public class WeiXinSouGouTask {

    @Autowired
    private WeiXinSouGouProcessor weiXinSouGouProcessor;

    @Autowired
    private WeiXinSouGouService weiXinSouGouService;

    /**
     * Runs the crawler, deletes duplicate rows afterwards and logs how many
     * the delete call reported.
     *
     * @return the value returned by {@code WeiXinSouGouProcessor.run()}
     */
    @ReptileLog(name = "微信搜狗", url = WEIXIN_SOUGOU_URL)
    public Long reptileWeiXinSouGou() {
        Long crawlCount = weiXinSouGouProcessor.run();

        // Remove duplicate rows once the crawl has finished.
        int removed = weiXinSouGouService.deleteRepeatData();
        // Parameterized message: same output text, no eager string concatenation.
        log.info("微信搜狗删除重复数据数:{}", removed);

        return crawlCount;
    }
}

@ -0,0 +1,37 @@
package com.xjs.weixin.webmagic;
import cn.hutool.core.collection.CollUtil;
import com.xjs.weixin.pojo.WeiXinSouGou;
import com.xjs.weixin.service.WeiXinSouGouService;
import lombok.extern.log4j.Log4j2;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import java.util.List;
/**
 * WebMagic pipeline that persists the articles a page processor published
 * under the {@code weiXinSouGouList} result key.
 *
 * @author xiejs
 * @since 2022-02-22
 */
@Component
@Log4j2
public class WeiXinSouGouPipeline implements Pipeline {

    @Autowired
    private WeiXinSouGouService weiXinSouGouService;

    /**
     * Saves the extracted articles in batches of 25; does nothing when the
     * result holds no (or an empty) article list.
     *
     * @param resultItems results produced for one page
     * @param task        the running crawl task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        final List<WeiXinSouGou> articles = resultItems.get("weiXinSouGouList");
        if (!CollUtil.isNotEmpty(articles)) {
            return;
        }
        weiXinSouGouService.saveBatch(articles, 25);
    }
}

@ -0,0 +1,167 @@
package com.xjs.weixin.webmagic;
import com.ruoyi.common.redis.service.RedisService;
import com.xjs.weixin.pojo.WeiXinSouGou;
import lombok.extern.log4j.Log4j2;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.selector.Selectable;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import static com.xjs.consts.RedisConst.REPTILE_WEIXIN_SOUGOU_COUNT;
import static com.xjs.consts.ReptileConst.WEIXIN_SOUGOU_URL;
/**
*
*
* @author xiejs
* @since 2022-02-21
*/
@Log4j2
@Component
public class WeiXinSouGouProcessor implements PageProcessor {
@Autowired
private WeiXinSouGouPipeline weiXinSouGouPipeline;
private static RedisService redisService;
@Autowired
public void setRedisService(RedisService redisService) {
WeiXinSouGouProcessor.redisService = redisService;
}
@Override
public void process(Page page) {
try {
//复杂度计算
//循环次数存入redis中
Integer count = redisService.getCacheObject(REPTILE_WEIXIN_SOUGOU_COUNT);
if (count == null) {
count = 0;
}
List<Selectable> nodes = page.getHtml().css("#type_tab > .fieed-box a").nodes();
//1、获取需要爬取的路径
Set<String> set = new HashSet<>();
for (Selectable node : nodes) {
count++;
String s = node.get();
Document parse = Jsoup.parse(s);
String id = parse.select("a").attr("id");
set.add(id);
}
set.removeIf(s -> !s.contains("pc_"));
List<String> list = new ArrayList<>();
for (String s : set) {
count++;
s = WEIXIN_SOUGOU_URL + "/pcindex/pc/" + s + "/" + s + ".html";
list.add(s);
}
//2、把所有连接加入到队列
page.addTargetRequests(list);
//3、获取需要的参数
List<Selectable> newsNodes = page.getHtml().css(".news-list > li").nodes();
List<WeiXinSouGou> weiXinSouGouList = new ArrayList<>();
for (Selectable newsNode : newsNodes) {
count++;
String s = newsNode.get();
//文章具体路径
String link = newsNode.css(".img-box > a", "href").get();
//图片路径
String imgSrc = newsNode.css(".img-box > a > img", "src").get();
//标题
String title = newsNode.css(".txt-box > h3 > a", "text").get();
//省略的内容
String content = newsNode.css(".txt-box > .txt-info", "text").get();
//来源
String source = newsNode.css(".s-p > a", "text").get();
WeiXinSouGou weiXinSouGou = new WeiXinSouGou()
.setUrl(link)
.setImgUrl(imgSrc)
.setTitle(title)
.setContent(content)
.setSource(source);
weiXinSouGouList.add(weiXinSouGou);
}
page.putField("weiXinSouGouList",weiXinSouGouList);
redisService.setCacheObject(REPTILE_WEIXIN_SOUGOU_COUNT, count + 1);
} catch (Exception e) {
e.printStackTrace();
} finally {
redisService.expire(REPTILE_WEIXIN_SOUGOU_COUNT, 3, TimeUnit.HOURS);
}
}
@Override
public Site getSite() {
return Site.me()
//.addHeader(headerKey, headerValue)
.setCharset("utf8")//设置字符编码
.setTimeOut(2000)//设置超时时间
.setRetrySleepTime(100)//设置重试间隔时间
.setCycleRetryTimes(10)//设置重试次数
.setSleepTime(1)//设置两个页面之间的间隔时间
;
}
/**
*
*
* @return
*/
public Long run() {
//执行爬虫
Spider.create(new WeiXinSouGouProcessor())
.addUrl(WEIXIN_SOUGOU_URL)//设置爬取地址
.thread(30)//设置爬取线程数
.setScheduler(new QueueScheduler()
.setDuplicateRemover(new BloomFilterDuplicateRemover(110000)))//设置url去重过滤器
.addPipeline(weiXinSouGouPipeline)//设置爬取之后的数据操作
//.setDownloader(downloader)//设置下载器
.run();//执行
//从redis中获取循环次数
Integer cache = redisService.getCacheObject(REPTILE_WEIXIN_SOUGOU_COUNT);
redisService.deleteObject(REPTILE_WEIXIN_SOUGOU_COUNT);
if (cache != null) {
return Long.valueOf(cache);
}
return 0L;
}
}

@ -0,0 +1,16 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper
PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
"http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.xjs.weixin.mapper.WeiXinSouGouMapper">
<!--
  Removes duplicate rows: for each distinct url only the row with the smallest id
  is kept, every other row is deleted.
  NOTE(review): the inner SELECT is wrapped in the derived table t, presumably to
  work around the MySQL restriction on selecting from the table being deleted
  from; confirm the target database.
-->
<delete id="deleteRepeatData">
delete from webmagic_weixin_sougou where id not in (
SELECT
t.min_id
FROM
( SELECT min( id ) AS min_id FROM webmagic_weixin_sougou GROUP BY url ) AS t
)
</delete>
</mapper>

@ -0,0 +1,26 @@
package com.xjs.weixin.task;
import com.xjs.XjsWebmagicApp;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import static org.junit.jupiter.api.Assertions.*;
/**
 * Spring Boot test that triggers the WeChat-Sogou crawl task once.
 *
 * @author xiejs
 * @since 2022-02-22
 */
@SpringBootTest(classes = XjsWebmagicApp.class)
class WeiXinSouGouTaskTest {

    @Autowired
    WeiXinSouGouTask task;

    /** Runs the task and prints the value it returns; no assertions are made. */
    @Test
    void reptileWeiXinSouGou() {
        System.out.println(task.reptileWeiXinSouGou());
    }
}
Loading…
Cancel
Save