From 58943979ace719b85e8594b50c046b594dfc8510 Mon Sep 17 00:00:00 2001 From: xjs <1294405880@qq.com> Date: Tue, 22 Feb 2022 09:47:46 +0800 Subject: [PATCH] =?UTF-8?q?1=E3=80=81=E4=BF=AE=E6=94=B936=E5=A3=81?= =?UTF-8?q?=E7=BA=B8=E6=8F=90=E4=BE=9B=E5=86=85=E9=83=A8=E8=B0=83=E7=94=A8?= =?UTF-8?q?rpc=E7=9A=84=E6=96=B9=E6=B3=95=E5=90=8D=202=E3=80=81=E5=AE=9E?= =?UTF-8?q?=E7=8E=B0=E5=BE=AE=E4=BF=A1=E6=90=9C=E7=8B=97=E7=88=AC=E8=99=AB?= =?UTF-8?q?=E5=B9=B6=E6=8A=8A=E6=95=B0=E6=8D=AE=E6=8C=81=E4=B9=85=E5=8C=96?= =?UTF-8?q?=E5=88=B0=E6=95=B0=E6=8D=AE=E5=BA=93?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../RemoteWebmagic36wallpaperFeign.java | 2 +- .../RemoteWebmagic36wallpaperFactory.java | 2 +- .../xjs/job/task/openapi/CopyWritingTask.java | 2 +- .../job/task/webmagic/_36wallpaperTask.java | 2 +- .../main/java/com/xjs/consts/RedisConst.java | 5 +- .../java/com/xjs/consts/ReptileConst.java | 9 +- .../controller/_36wallpaperController.java | 2 +- .../service/impl/_36wallpaperServiceImpl.java | 4 +- .../webmagic/_36wallpaperProcessor.java | 12 +- .../controller/WeiXinSouGouController.java | 35 ++++ .../xjs/weixin/mapper/WeiXinSouGouMapper.java | 18 ++ .../com/xjs/weixin/pojo/WeiXinSouGou.java | 51 ++++++ .../weixin/service/WeiXinSouGouService.java | 18 ++ .../service/impl/WeiXinSouGouServiceImpl.java | 26 +++ .../com/xjs/weixin/task/WeiXinSouGouTask.java | 37 ++++ .../weixin/webmagic/WeiXinSouGouPipeline.java | 37 ++++ .../webmagic/WeiXinSouGouProcessor.java | 167 ++++++++++++++++++ .../mapper/webmagic/WeiXinSouGouMapper.xml | 16 ++ .../xjs/weixin/task/WeiXinSouGouTaskTest.java | 26 +++ 19 files changed, 456 insertions(+), 15 deletions(-) create mode 100644 xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/controller/WeiXinSouGouController.java create mode 100644 xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/mapper/WeiXinSouGouMapper.java create mode 100644 xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/pojo/WeiXinSouGou.java create mode 100644 xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/service/WeiXinSouGouService.java create mode 100644 xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/service/impl/WeiXinSouGouServiceImpl.java create mode 100644 xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/task/WeiXinSouGouTask.java create mode 100644 xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/webmagic/WeiXinSouGouPipeline.java create mode 100644 xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/webmagic/WeiXinSouGouProcessor.java create mode 100644 xjs-business/xjs-business-webmagic/src/main/resources/mapper/webmagic/WeiXinSouGouMapper.xml create mode 100644 xjs-business/xjs-business-webmagic/src/test/java/com/xjs/weixin/task/WeiXinSouGouTaskTest.java diff --git a/ruoyi-api/ruoyi-api-system/src/main/java/com/xjs/business/webmagic/RemoteWebmagic36wallpaperFeign.java b/ruoyi-api/ruoyi-api-system/src/main/java/com/xjs/business/webmagic/RemoteWebmagic36wallpaperFeign.java index 5b01a81b..fc88db8b 100644 --- a/ruoyi-api/ruoyi-api-system/src/main/java/com/xjs/business/webmagic/RemoteWebmagic36wallpaperFeign.java +++ b/ruoyi-api/ruoyi-api-system/src/main/java/com/xjs/business/webmagic/RemoteWebmagic36wallpaperFeign.java @@ -18,7 +18,7 @@ import org.springframework.web.bind.annotation.GetMapping; public interface RemoteWebmagic36wallpaperFeign { @GetMapping("/_36wallpaper/taskForPRC") - public R _36wallpaperControllerTaskForPRC(); + public R _36wallpaperTaskForPRC(); } diff --git a/ruoyi-api/ruoyi-api-system/src/main/java/com/xjs/business/webmagic/factory/RemoteWebmagic36wallpaperFactory.java b/ruoyi-api/ruoyi-api-system/src/main/java/com/xjs/business/webmagic/factory/RemoteWebmagic36wallpaperFactory.java index 05a08aeb..74f190d5 100644 --- a/ruoyi-api/ruoyi-api-system/src/main/java/com/xjs/business/webmagic/factory/RemoteWebmagic36wallpaperFactory.java +++ b/ruoyi-api/ruoyi-api-system/src/main/java/com/xjs/business/webmagic/factory/RemoteWebmagic36wallpaperFactory.java @@ -18,7 +18,7 @@ public class RemoteWebmagic36wallpaperFactory implements FallbackFactory r = remoteCopyWritingFeign.copyWriting(); log.info("文案定时任务[{}]结果:code={},msg={},data={}",i,r.getCode(),r.getMsg(),r.getData()); diff --git a/ruoyi-modules/ruoyi-job/src/main/java/com/xjs/job/task/webmagic/_36wallpaperTask.java b/ruoyi-modules/ruoyi-job/src/main/java/com/xjs/job/task/webmagic/_36wallpaperTask.java index 00afb66d..c7674304 100644 --- a/ruoyi-modules/ruoyi-job/src/main/java/com/xjs/job/task/webmagic/_36wallpaperTask.java +++ b/ruoyi-modules/ruoyi-job/src/main/java/com/xjs/job/task/webmagic/_36wallpaperTask.java @@ -29,7 +29,7 @@ public class _36wallpaperTask { log.info("---------------爬虫-36壁纸网定时任务Start-------------------"); LocalDateTime localDateTime1 = DateUtil.date().toLocalDateTime(); - R r = remoteWebmagic36wallpaperFeign._36wallpaperControllerTaskForPRC(); + R r = remoteWebmagic36wallpaperFeign._36wallpaperTaskForPRC(); log.info("爬虫-36壁纸网定时任务结果:code={},msg={},data={}",r.getCode(),r.getMsg(),r.getData()); LocalDateTime localDateTime2 = DateUtil.date().toLocalDateTime(); diff --git a/xjs-business/xjs-business-common/src/main/java/com/xjs/consts/RedisConst.java b/xjs-business/xjs-business-common/src/main/java/com/xjs/consts/RedisConst.java index b750a286..2d16c2d6 100644 --- a/xjs-business/xjs-business-common/src/main/java/com/xjs/consts/RedisConst.java +++ b/xjs-business/xjs-business-common/src/main/java/com/xjs/consts/RedisConst.java @@ -47,7 +47,10 @@ public class RedisConst { /** * 爬虫记录循环次数常量信息 */ - public static final String REPTILE_COUNT= "reptile:_36wallpaper.count"; + public static final String REPTILE_36_WALLPAPER_COUNT= "reptile:_36wallpaper.count"; + + + public static final String REPTILE_WEIXIN_SOUGOU_COUNT= "reptile:weixin.sougou.count"; //-------------------有效时间----------------------- public static final Integer TRAN_DICT_EXPIRE = 1; //小时 diff --git a/xjs-business/xjs-business-common/src/main/java/com/xjs/consts/ReptileConst.java b/xjs-business/xjs-business-common/src/main/java/com/xjs/consts/ReptileConst.java index 558f208f..e1619d37 100644 --- a/xjs-business/xjs-business-common/src/main/java/com/xjs/consts/ReptileConst.java +++ b/xjs-business/xjs-business-common/src/main/java/com/xjs/consts/ReptileConst.java @@ -30,9 +30,16 @@ public class ReptileConst { */ public static final String BOSS_JOB_URL= "https://www.zhipin.com"; - + /** + * 36壁纸网url + */ public static final String _36_WALLPAPER_URL= "https://www.3gbizhi.com/"; + /** + * 搜狗微信url + */ + public static final String WEIXIN_SOUGOU_URL= "https://weixin.sogou.com/"; + diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/_36wallpaper/controller/_36wallpaperController.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/_36wallpaper/controller/_36wallpaperController.java index c9595325..1a3f9fb2 100644 --- a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/_36wallpaper/controller/_36wallpaperController.java +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/_36wallpaper/controller/_36wallpaperController.java @@ -64,7 +64,7 @@ public class _36wallpaperController extends MyBaseController { //----------------------远程rpc调用--------------------------- @GetMapping("taskForPRC") @ApiOperation("供定时任务服务RPC远程调用") - public R _36wallpaperControllerTaskForPRC() { + public R _36wallpaperTaskForPRC() { Long count = wallpaperTask.reptileWallpaper(); return R.ok(count); } diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/_36wallpaper/service/impl/_36wallpaperServiceImpl.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/_36wallpaper/service/impl/_36wallpaperServiceImpl.java index cdc57bf9..71428dd4 100644 --- a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/_36wallpaper/service/impl/_36wallpaperServiceImpl.java +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/_36wallpaper/service/impl/_36wallpaperServiceImpl.java @@ -17,7 +17,7 @@ import javax.annotation.Resource; import java.util.regex.Pattern; import static com.xjs._36wallpaper.consts._36wallpaperConst.*; -import static com.xjs.consts.RedisConst.REPTILE_COUNT; +import static com.xjs.consts.RedisConst.REPTILE_36_WALLPAPER_COUNT; import static com.xjs.consts.RegexConst.FILE_PATH_REGEX; /** @@ -134,7 +134,7 @@ public class _36wallpaperServiceImpl extends ServiceImpl<_36wallpaperMapper, _36 */ private void checkRunning() { //判断爬虫是否正在执行,正在执行不可修改! - if(redisService.hasKey(REPTILE_COUNT)){ + if(redisService.hasKey(REPTILE_36_WALLPAPER_COUNT)){ throw new BusinessException("爬虫正在执行中!暂时无法修改,请稍后再试"); } } diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/_36wallpaper/webmagic/_36wallpaperProcessor.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/_36wallpaper/webmagic/_36wallpaperProcessor.java index 8f026d2b..01f1d111 100644 --- a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/_36wallpaper/webmagic/_36wallpaperProcessor.java +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/_36wallpaper/webmagic/_36wallpaperProcessor.java @@ -28,7 +28,7 @@ import java.util.concurrent.TimeUnit; import static com.xjs._36wallpaper.consts._36wallpaperConst.CONFIG_KEY; import static com.xjs._36wallpaper.consts._36wallpaperConst.REDIS_KEY; -import static com.xjs.consts.RedisConst.REPTILE_COUNT; +import static com.xjs.consts.RedisConst.REPTILE_36_WALLPAPER_COUNT; import static com.xjs.consts.ReptileConst._36_WALLPAPER_URL; /** @@ -238,16 +238,16 @@ public class _36wallpaperProcessor implements PageProcessor { //page.putField("_36wallpaperData",wallpapers); //循环次数存入redis中 - Integer count = redisService.getCacheObject(REPTILE_COUNT); + Integer count = redisService.getCacheObject(REPTILE_36_WALLPAPER_COUNT); if (count == null) { count = 0; } - redisService.setCacheObject(REPTILE_COUNT, count + 1); + redisService.setCacheObject(REPTILE_36_WALLPAPER_COUNT, count + 1); } catch (Exception e) { e.printStackTrace(); } finally { - redisService.expire(REPTILE_COUNT, 1, TimeUnit.HOURS); + redisService.expire(REPTILE_36_WALLPAPER_COUNT, 1, TimeUnit.HOURS); } } @@ -291,8 +291,8 @@ public class _36wallpaperProcessor implements PageProcessor { log.info("36壁纸删除重复数据数:" + count); //从redis中获取循环次数 - Integer cache = redisService.getCacheObject(REPTILE_COUNT); - redisService.deleteObject(REPTILE_COUNT); + Integer cache = redisService.getCacheObject(REPTILE_36_WALLPAPER_COUNT); + redisService.deleteObject(REPTILE_36_WALLPAPER_COUNT); if (cache != null) { return Long.valueOf(cache); diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/controller/WeiXinSouGouController.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/controller/WeiXinSouGouController.java new file mode 100644 index 00000000..e0b0df23 --- /dev/null +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/controller/WeiXinSouGouController.java @@ -0,0 +1,35 @@ +package com.xjs.weixin.controller; + +import com.ruoyi.common.core.domain.R; +import com.xjs.weixin.task.WeiXinSouGouTask; +import io.swagger.annotations.Api; +import io.swagger.annotations.ApiOperation; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.web.bind.annotation.GetMapping; +import org.springframework.web.bind.annotation.RequestMapping; +import org.springframework.web.bind.annotation.RestController; + +/** + * 微信搜狗controller + * @author xiejs + * @since 2022-02-22 + */ +@RestController +@RequestMapping("weixin_sougou") +@Api(tags = "爬虫模块-微信搜狗") +public class WeiXinSouGouController { + + @Autowired + private WeiXinSouGouTask weiXinSouGouTask; + + + + //----------------------远程rpc调用--------------------------- + @GetMapping("taskForPRC") + @ApiOperation("供定时任务服务RPC远程调用") + public R WeiXinSouGouTaskForPRC() { + Long count = weiXinSouGouTask.reptileWeiXinSouGou(); + return R.ok(count); + } + +} diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/mapper/WeiXinSouGouMapper.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/mapper/WeiXinSouGouMapper.java new file mode 100644 index 00000000..b7a27812 --- /dev/null +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/mapper/WeiXinSouGouMapper.java @@ -0,0 +1,18 @@ +package com.xjs.weixin.mapper; + +import com.baomidou.mybatisplus.core.mapper.BaseMapper; +import com.xjs.weixin.pojo.WeiXinSouGou; + +/** + * 微信搜狗mapper + * @author xiejs + * @since 2022-02-22 + */ +public interface WeiXinSouGouMapper extends BaseMapper { + + /** + * 删除重复数据 + * @return int + */ + int deleteRepeatData(); +} diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/pojo/WeiXinSouGou.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/pojo/WeiXinSouGou.java new file mode 100644 index 00000000..278b2068 --- /dev/null +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/pojo/WeiXinSouGou.java @@ -0,0 +1,51 @@ +package com.xjs.weixin.pojo; + +import com.baomidou.mybatisplus.annotation.FieldFill; +import com.baomidou.mybatisplus.annotation.TableField; +import com.baomidou.mybatisplus.annotation.TableName; +import com.ruoyi.common.core.annotation.Excel; +import lombok.Data; +import lombok.experimental.Accessors; + +import java.io.Serializable; +import java.util.Date; + +/** + * 微信搜狗爬虫数据实体类 + * @author xiejs + * @since 2022-02-22 + */ +@Data +@Accessors(chain = true) +@TableName("webmagic_weixin_sougou") +public class WeiXinSouGou implements Serializable { + + private static final long serialVersionUID = 1L; + + /** 主键id */ + private Long id; + + /** 文章标题 */ + @Excel(name = "文章标题") + private String title; + + /** 简略的内容 */ + @Excel(name = "简略的内容") + private String content; + + /** 文章来源 */ + @Excel(name = "文章来源") + private String source; + + /** 文章的链接 */ + @Excel(name = "文章的链接") + private String url; + + /** 图片的链接 */ + @Excel(name = "图片的链接") + private String imgUrl; + + @Excel(name = "创建时间",dateFormat = "yyyy-MM-dd HH:mm:ss") + @TableField(fill = FieldFill.INSERT) + private Date createTime; +} diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/service/WeiXinSouGouService.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/service/WeiXinSouGouService.java new file mode 100644 index 00000000..a5fd4706 --- /dev/null +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/service/WeiXinSouGouService.java @@ -0,0 +1,18 @@ +package com.xjs.weixin.service; + +import com.baomidou.mybatisplus.extension.service.IService; +import com.xjs.weixin.pojo.WeiXinSouGou; + +/** + * 微信搜狗service接口 + * @author xiejs + * @since 2022-02-22 + */ +public interface WeiXinSouGouService extends IService { + + /** + * 删除重复数据 + * @return int + */ + int deleteRepeatData(); +} diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/service/impl/WeiXinSouGouServiceImpl.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/service/impl/WeiXinSouGouServiceImpl.java new file mode 100644 index 00000000..f0920a08 --- /dev/null +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/service/impl/WeiXinSouGouServiceImpl.java @@ -0,0 +1,26 @@ +package com.xjs.weixin.service.impl; + +import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl; +import com.xjs.weixin.mapper.WeiXinSouGouMapper; +import com.xjs.weixin.pojo.WeiXinSouGou; +import com.xjs.weixin.service.WeiXinSouGouService; +import org.springframework.stereotype.Service; + +import javax.annotation.Resource; + +/** + * 微信搜狗service实现 + * @author xiejs + * @since 2022-02-22 + */ +@Service +public class WeiXinSouGouServiceImpl extends ServiceImpl implements WeiXinSouGouService { + + @Resource + private WeiXinSouGouMapper weiXinSouGouMapper; + + @Override + public int deleteRepeatData() { + return weiXinSouGouMapper.deleteRepeatData(); + } +} diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/task/WeiXinSouGouTask.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/task/WeiXinSouGouTask.java new file mode 100644 index 00000000..bc659c38 --- /dev/null +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/task/WeiXinSouGouTask.java @@ -0,0 +1,37 @@ +package com.xjs.weixin.task; + +import com.xjs.annotation.ReptileLog; +import com.xjs.weixin.service.WeiXinSouGouService; +import com.xjs.weixin.webmagic.WeiXinSouGouProcessor; +import lombok.extern.log4j.Log4j2; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Component; + +import static com.xjs.consts.ReptileConst.WEIXIN_SOUGOU_URL; + +/** + * 微信搜狗任务 + * @author xiejs + * @since 2022-02-22 + */ +@Component +@Log4j2 +public class WeiXinSouGouTask { + + @Autowired + private WeiXinSouGouProcessor weiXinSouGouProcessor; + @Autowired + private WeiXinSouGouService weiXinSouGouService; + + @ReptileLog(name = "微信搜狗", url = WEIXIN_SOUGOU_URL) + public Long reptileWeiXinSouGou() { + Long run = weiXinSouGouProcessor.run(); + + //删除重复数据 + int count = weiXinSouGouService.deleteRepeatData(); + log.info("微信搜狗删除重复数据数:" + count); + + return run; + } + +} diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/webmagic/WeiXinSouGouPipeline.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/webmagic/WeiXinSouGouPipeline.java new file mode 100644 index 00000000..aca20e79 --- /dev/null +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/webmagic/WeiXinSouGouPipeline.java @@ -0,0 +1,37 @@ +package com.xjs.weixin.webmagic; + +import cn.hutool.core.collection.CollUtil; +import com.xjs.weixin.pojo.WeiXinSouGou; +import com.xjs.weixin.service.WeiXinSouGouService; +import lombok.extern.log4j.Log4j2; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Component; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.pipeline.Pipeline; + +import java.util.List; + +/** + * 微信搜狗网 爬虫数据处理 + * @author xiejs + * @since 2022-02-22 + */ +@Component +@Log4j2 +public class WeiXinSouGouPipeline implements Pipeline { + + @Autowired + private WeiXinSouGouService weiXinSouGouService; + + @Override + public void process(ResultItems resultItems, Task task) { + List weiXinSouGouList =resultItems.get("weiXinSouGouList"); + + if (CollUtil.isNotEmpty(weiXinSouGouList)) { + weiXinSouGouService.saveBatch(weiXinSouGouList, 25); + } + + + } +} diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/webmagic/WeiXinSouGouProcessor.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/webmagic/WeiXinSouGouProcessor.java new file mode 100644 index 00000000..2a94bcfe --- /dev/null +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/webmagic/WeiXinSouGouProcessor.java @@ -0,0 +1,167 @@ +package com.xjs.weixin.webmagic; + +import com.ruoyi.common.redis.service.RedisService; +import com.xjs.weixin.pojo.WeiXinSouGou; +import lombok.extern.log4j.Log4j2; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Component; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover; +import us.codecraft.webmagic.scheduler.QueueScheduler; +import us.codecraft.webmagic.selector.Selectable; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.concurrent.TimeUnit; + +import static com.xjs.consts.RedisConst.REPTILE_WEIXIN_SOUGOU_COUNT; +import static com.xjs.consts.ReptileConst.WEIXIN_SOUGOU_URL; + +/** + * 微信搜狗网 爬虫处理 + * + * @author xiejs + * @since 2022-02-21 + */ +@Log4j2 +@Component +public class WeiXinSouGouProcessor implements PageProcessor { + + @Autowired + private WeiXinSouGouPipeline weiXinSouGouPipeline; + + private static RedisService redisService; + + @Autowired + public void setRedisService(RedisService redisService) { + WeiXinSouGouProcessor.redisService = redisService; + } + + + @Override + public void process(Page page) { + try { + //复杂度计算 + //循环次数存入redis中 + Integer count = redisService.getCacheObject(REPTILE_WEIXIN_SOUGOU_COUNT); + if (count == null) { + count = 0; + } + + List nodes = page.getHtml().css("#type_tab > .fieed-box a").nodes(); + + //1、获取需要爬取的路径 + Set set = new HashSet<>(); + for (Selectable node : nodes) { + count++; + + String s = node.get(); + + Document parse = Jsoup.parse(s); + + String id = parse.select("a").attr("id"); + + set.add(id); + } + set.removeIf(s -> !s.contains("pc_")); + List list = new ArrayList<>(); + for (String s : set) { + count++; + + s = WEIXIN_SOUGOU_URL + "/pcindex/pc/" + s + "/" + s + ".html"; + list.add(s); + } + + //2、把所有连接加入到队列 + page.addTargetRequests(list); + + //3、获取需要的参数 + List newsNodes = page.getHtml().css(".news-list > li").nodes(); + List weiXinSouGouList = new ArrayList<>(); + for (Selectable newsNode : newsNodes) { + count++; + + String s = newsNode.get(); + + //文章具体路径 + String link = newsNode.css(".img-box > a", "href").get(); + + //图片路径 + String imgSrc = newsNode.css(".img-box > a > img", "src").get(); + + //标题 + String title = newsNode.css(".txt-box > h3 > a", "text").get(); + + //省略的内容 + String content = newsNode.css(".txt-box > .txt-info", "text").get(); + + //来源 + String source = newsNode.css(".s-p > a", "text").get(); + + WeiXinSouGou weiXinSouGou = new WeiXinSouGou() + .setUrl(link) + .setImgUrl(imgSrc) + .setTitle(title) + .setContent(content) + .setSource(source); + weiXinSouGouList.add(weiXinSouGou); + } + + page.putField("weiXinSouGouList",weiXinSouGouList); + + redisService.setCacheObject(REPTILE_WEIXIN_SOUGOU_COUNT, count + 1); + } catch (Exception e) { + e.printStackTrace(); + } finally { + redisService.expire(REPTILE_WEIXIN_SOUGOU_COUNT, 3, TimeUnit.HOURS); + } + } + + @Override + public Site getSite() { + return Site.me() + //.addHeader(headerKey, headerValue) + .setCharset("utf8")//设置字符编码 + .setTimeOut(2000)//设置超时时间 + .setRetrySleepTime(100)//设置重试间隔时间 + .setCycleRetryTimes(10)//设置重试次数 + .setSleepTime(1)//设置两个页面之间的间隔时间 + ; + } + + /** + * 执行爬虫 + * + * @return 返回循环次数 + */ + public Long run() { + //执行爬虫 + Spider.create(new WeiXinSouGouProcessor()) + .addUrl(WEIXIN_SOUGOU_URL)//设置爬取地址 + .thread(30)//设置爬取线程数 + .setScheduler(new QueueScheduler() + .setDuplicateRemover(new BloomFilterDuplicateRemover(110000)))//设置url去重过滤器 + .addPipeline(weiXinSouGouPipeline)//设置爬取之后的数据操作 + //.setDownloader(downloader)//设置下载器 + .run();//执行 + + //从redis中获取循环次数 + Integer cache = redisService.getCacheObject(REPTILE_WEIXIN_SOUGOU_COUNT); + redisService.deleteObject(REPTILE_WEIXIN_SOUGOU_COUNT); + + if (cache != null) { + return Long.valueOf(cache); + } + + return 0L; + } + + +} diff --git a/xjs-business/xjs-business-webmagic/src/main/resources/mapper/webmagic/WeiXinSouGouMapper.xml b/xjs-business/xjs-business-webmagic/src/main/resources/mapper/webmagic/WeiXinSouGouMapper.xml new file mode 100644 index 00000000..5e1ed5d5 --- /dev/null +++ b/xjs-business/xjs-business-webmagic/src/main/resources/mapper/webmagic/WeiXinSouGouMapper.xml @@ -0,0 +1,16 @@ + + + + + + + delete from webmagic_weixin_sougou where id not in ( + SELECT + t.min_id + FROM + ( SELECT min( id ) AS min_id FROM webmagic_weixin_sougou GROUP BY url ) AS t + ) + + \ No newline at end of file diff --git a/xjs-business/xjs-business-webmagic/src/test/java/com/xjs/weixin/task/WeiXinSouGouTaskTest.java b/xjs-business/xjs-business-webmagic/src/test/java/com/xjs/weixin/task/WeiXinSouGouTaskTest.java new file mode 100644 index 00000000..65a4973d --- /dev/null +++ b/xjs-business/xjs-business-webmagic/src/test/java/com/xjs/weixin/task/WeiXinSouGouTaskTest.java @@ -0,0 +1,26 @@ +package com.xjs.weixin.task; + +import com.xjs.XjsWebmagicApp; +import org.junit.jupiter.api.Test; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * @author xiejs + * @since 2022-02-22 + */ +@SpringBootTest(classes = XjsWebmagicApp.class) +class WeiXinSouGouTaskTest { + + @Autowired + WeiXinSouGouTask task; + + @Test + void reptileWeiXinSouGou() { + + Long aLong = task.reptileWeiXinSouGou(); + System.out.println(aLong); + } +} \ No newline at end of file