From 59400b2e3e9b562079f4d7b52e3576f6760365b9 Mon Sep 17 00:00:00 2001 From: xjs <1294405880@qq.com> Date: Fri, 18 Mar 2022 01:05:52 +0800 Subject: [PATCH] =?UTF-8?q?1=E3=80=81=E7=88=AC=E8=99=AB=E5=AE=9E=E7=8E=B0?= =?UTF-8?q?=E7=88=AC=E5=8F=96=E5=BE=AE=E4=BF=A1=E6=96=87=E7=AB=A0=E5=9B=BE?= =?UTF-8?q?=E7=89=87=E5=8A=9F=E8=83=BD=EF=BC=8C=E4=BF=9D=E5=AD=98=E5=88=B0?= =?UTF-8?q?=E7=A3=81=E7=9B=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../main/java/com/xjs/consts/RedisConst.java | 15 +- .../java/com/xjs/common/aop/ApiLogAspect.java | 3 + .../com/xjs/weixin/consts/WeiXinConst.java | 34 ++++ .../controller/WeiXinLinkController.java | 37 ++++ .../controller/WeiXinSouGouController.java | 9 +- .../xjs/weixin/service/WeiXinLinkService.java | 16 ++ .../service/impl/WeiXinLinkServiceImpl.java | 25 +++ .../com/xjs/weixin/task/WeiXinLinkTask.java | 52 ++++++ .../weixin/webmagic/WeiXinLinkPipeline.java | 168 ++++++++++++++++++ .../weixin/webmagic/WeiXinLinkProcessor.java | 64 +++++++ 10 files changed, 417 insertions(+), 6 deletions(-) create mode 100644 xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/consts/WeiXinConst.java create mode 100644 xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/controller/WeiXinLinkController.java create mode 100644 xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/service/WeiXinLinkService.java create mode 100644 xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/service/impl/WeiXinLinkServiceImpl.java create mode 100644 xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/task/WeiXinLinkTask.java create mode 100644 xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/webmagic/WeiXinLinkPipeline.java create mode 100644 xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/webmagic/WeiXinLinkProcessor.java diff --git a/xjs-business/xjs-business-common/src/main/java/com/xjs/consts/RedisConst.java b/xjs-business/xjs-business-common/src/main/java/com/xjs/consts/RedisConst.java index 2d16c2d6..342e20c4 100644 --- a/xjs-business/xjs-business-common/src/main/java/com/xjs/consts/RedisConst.java +++ b/xjs-business/xjs-business-common/src/main/java/com/xjs/consts/RedisConst.java @@ -2,6 +2,7 @@ package com.xjs.consts; /** * redis key常量 + * * @author xiejs * @since 2021-12-30 */ @@ -45,12 +46,20 @@ public class RedisConst { public static final String FORECAST_WEATHER = "weather:forecast"; /** - * 爬虫记录循环次数常量信息 + * 爬虫记录循环次数常量信息:_36wallpaper */ - public static final String REPTILE_36_WALLPAPER_COUNT= "reptile:_36wallpaper.count"; + public static final String REPTILE_36_WALLPAPER_COUNT = "reptile:_36wallpaper.count"; + /** + * 爬虫记录循环次数常量信息:weixin.sougou + */ + public static final String REPTILE_WEIXIN_SOUGOU_COUNT = "reptile:weixin.sougou.count"; - public static final String REPTILE_WEIXIN_SOUGOU_COUNT= "reptile:weixin.sougou.count"; + /** + * 爬虫记录循环次数常量信息:weixin.link + */ + public static final String REPTILE_WEIXIN_LINK_COUNT = "reptile:weixin.link.count"; + ; //-------------------有效时间----------------------- public static final Integer TRAN_DICT_EXPIRE = 1; //小时 diff --git a/xjs-business/xjs-business-openapi/src/main/java/com/xjs/common/aop/ApiLogAspect.java b/xjs-business/xjs-business-openapi/src/main/java/com/xjs/common/aop/ApiLogAspect.java index 87e3aeb1..6b3bb5cd 100644 --- a/xjs-business/xjs-business-openapi/src/main/java/com/xjs/common/aop/ApiLogAspect.java +++ b/xjs-business/xjs-business-openapi/src/main/java/com/xjs/common/aop/ApiLogAspect.java @@ -92,6 +92,9 @@ public class ApiLogAspect { if (obj instanceof String) { if (StringUtils.isNotEmpty(String.valueOf(obj))) { this.warning(between, joinPoint); + }else { + this.demoteHandle(joinPoint); + log.info("降级!调用接口耗费时间:{}ms", between); } } diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/consts/WeiXinConst.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/consts/WeiXinConst.java new file mode 100644 index 00000000..306c266c --- /dev/null +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/consts/WeiXinConst.java @@ -0,0 +1,34 @@ +package com.xjs.weixin.consts; + +/** + * 微信爬虫常量 + * + * @author xiejs + * @since 2022-03-17 + */ +public class WeiXinConst { + /** + * 磁盘默认地址 + */ + public static final String PATH = "D:\\Dev\\WebCrawler\\Wechat"; + + /** + * redis的key + */ + public static final String REDIS_KEY = "sys_config:xjs.webmagic.wechatPicture"; + + /** + * 系统配置表中的key + */ + public static final String CONFIG_KEY = "xjs.webmagic.wechatPicture"; + + public static final String JPEG = "jpeg"; + + public static final String JPG = "jpg"; + + public static final String PNG = "png"; + + public static final String DOT = "."; + + +} diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/controller/WeiXinLinkController.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/controller/WeiXinLinkController.java new file mode 100644 index 00000000..4fcd083a --- /dev/null +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/controller/WeiXinLinkController.java @@ -0,0 +1,37 @@ +package com.xjs.weixin.controller; + +import com.ruoyi.common.core.web.controller.BaseController; +import com.ruoyi.common.core.web.domain.AjaxResult; +import com.ruoyi.common.security.annotation.RequiresPermissions; +import com.xjs.weixin.service.WeiXinLinkService; +import io.swagger.annotations.Api; +import io.swagger.annotations.ApiOperation; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.web.bind.annotation.GetMapping; +import org.springframework.web.bind.annotation.RequestMapping; +import org.springframework.web.bind.annotation.RequestParam; +import org.springframework.web.bind.annotation.RestController; + +/** + * 微信文章链接控制器 + * + * @author xiejs + * @since 2022-03-17 + */ +@RestController +@RequestMapping("weixin_link") +@Api(tags = "爬虫模块-微信链接") +public class WeiXinLinkController extends BaseController { + + @Autowired + private WeiXinLinkService weiXInLinkService; + + @RequiresPermissions("webmagic:weixinlink:get") + @GetMapping("/getPicture") + @ApiOperation("获取文章图片") + public AjaxResult getPicture(@RequestParam("link") String link) { + Boolean flag = weiXInLinkService.getPicture(link); + return toAjax(flag); + } + +} diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/controller/WeiXinSouGouController.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/controller/WeiXinSouGouController.java index 3799274c..88359901 100644 --- a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/controller/WeiXinSouGouController.java +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/controller/WeiXinSouGouController.java @@ -3,6 +3,7 @@ package com.xjs.weixin.controller; import com.ejlchina.searcher.SearchResult; import com.ruoyi.common.core.domain.R; import com.ruoyi.common.core.utils.poi.ExcelUtil; +import com.ruoyi.common.core.web.controller.BaseController; import com.ruoyi.common.core.web.domain.AjaxResult; import com.ruoyi.common.core.web.page.PageDomain; import com.ruoyi.common.core.web.page.TableSupport; @@ -10,7 +11,6 @@ import com.ruoyi.common.log.annotation.Log; import com.ruoyi.common.log.enums.BusinessType; import com.ruoyi.common.security.annotation.RequiresPermissions; import com.xjs.validation.group.SelectGroup; -import com.xjs.web.MyBaseController; import com.xjs.weixin.pojo.WeiXinSouGou; import com.xjs.weixin.service.WeiXinSouGouService; import com.xjs.weixin.task.WeiXinSouGouTask; @@ -32,7 +32,7 @@ import java.util.List; @RestController @RequestMapping("weixin_sougou") @Api(tags = "爬虫模块-微信搜狗") -public class WeiXinSouGouController extends MyBaseController { +public class WeiXinSouGouController extends BaseController { @Autowired private WeiXinSouGouTask weiXinSouGouTask; @@ -45,6 +45,7 @@ public class WeiXinSouGouController extends MyBaseController { */ @RequiresPermissions("webmagic:weixinsougou:list") @GetMapping("/list") + @ApiOperation("查询爬虫微信搜狗搜索列表") public AjaxResult list(@Validated({SelectGroup.class}) WeiXinSouGou weiXinSouGou) { //startPage(); PageDomain pageDomain = TableSupport.buildPageRequest(); @@ -71,6 +72,7 @@ public class WeiXinSouGouController extends MyBaseController { @RequiresPermissions("webmagic:weixinsougou:export") @Log(title = "微信搜狗", businessType = BusinessType.EXPORT) @PostMapping("/export") + @ApiOperation("导出爬虫微信搜狗搜索列表") public void export(HttpServletResponse response, WeiXinSouGou weiXinSouGou) { List list = weiXinSouGouService.selectWeiXinSouGouList(weiXinSouGou); ExcelUtil util = new ExcelUtil<>(WeiXinSouGou.class); @@ -81,8 +83,9 @@ public class WeiXinSouGouController extends MyBaseController { * 删除爬虫微信搜狗搜索 */ @RequiresPermissions("webmagic:weixinsougou:remove") - @Log(title = "爬虫微信搜狗搜索", businessType = BusinessType.DELETE) + @Log(title = "微信搜狗", businessType = BusinessType.DELETE) @DeleteMapping("/{ids}") + @ApiOperation("爬虫微信搜狗搜索") public AjaxResult remove(@PathVariable Long[] ids) { return toAjax(weiXinSouGouService.deleteWeiXinSouGouByIds(ids)); } diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/service/WeiXinLinkService.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/service/WeiXinLinkService.java new file mode 100644 index 00000000..56f70a84 --- /dev/null +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/service/WeiXinLinkService.java @@ -0,0 +1,16 @@ +package com.xjs.weixin.service; + +/** + * 微信文章链接service接口 + * @author xiejs + * @since 2022-03-17 + */ +public interface WeiXinLinkService { + + /** + * 爬虫获取微信文章图片 + * @return 布尔 + * @param link 链接地址 + */ + Boolean getPicture(String link); +} diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/service/impl/WeiXinLinkServiceImpl.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/service/impl/WeiXinLinkServiceImpl.java new file mode 100644 index 00000000..6c3ba26a --- /dev/null +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/service/impl/WeiXinLinkServiceImpl.java @@ -0,0 +1,25 @@ +package com.xjs.weixin.service.impl; + +import com.xjs.weixin.service.WeiXinLinkService; +import com.xjs.weixin.task.WeiXinLinkTask; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Service; + +/** + * 微信文章链接service接口实现 + * + * @author xiejs + * @since 2022-03-17 + */ +@Service +public class WeiXinLinkServiceImpl implements WeiXinLinkService { + + @Autowired + private WeiXinLinkTask weiXinLinkTask; + + @Override + public Boolean getPicture(String link) { + Long count = weiXinLinkTask.reptileWeiXinLink(link); + return count != 0L; + } +} diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/task/WeiXinLinkTask.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/task/WeiXinLinkTask.java new file mode 100644 index 00000000..3d3e32e2 --- /dev/null +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/task/WeiXinLinkTask.java @@ -0,0 +1,52 @@ +package com.xjs.weixin.task; + +import com.ruoyi.common.redis.service.RedisService; +import com.xjs.annotation.ReptileLog; +import com.xjs.weixin.webmagic.WeiXinLinkPipeline; +import com.xjs.weixin.webmagic.WeiXinLinkProcessor; +import lombok.extern.log4j.Log4j2; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Component; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover; +import us.codecraft.webmagic.scheduler.QueueScheduler; + +import static com.xjs.consts.RedisConst.REPTILE_WEIXIN_LINK_COUNT; + +/** + * 微信链接爬虫任务 + * @author xiejs + * @since 2022-03-17 + */ +@Component +@Log4j2 +public class WeiXinLinkTask { + + @Autowired + private WeiXinLinkProcessor weiXinLinkProcessor; + @Autowired + private WeiXinLinkPipeline weiXinLinkPipeline; + @Autowired + private RedisService redisService; + + @ReptileLog(name = "微信链接", url = "###") + public Long reptileWeiXinLink(String link) { + //执行爬虫 + Spider.create(weiXinLinkProcessor) + .addUrl(link)//设置爬取地址 + .thread(30)//设置爬取线程数 + .setScheduler(new QueueScheduler() + .setDuplicateRemover(new BloomFilterDuplicateRemover(110000)))//设置url去重过滤器 + .addPipeline(weiXinLinkPipeline)//设置爬取之后的数据操作 + //.setDownloader(downloader)//设置下载器 + .run();//执行 + + Integer cache = redisService.getCacheObject(REPTILE_WEIXIN_LINK_COUNT); + redisService.deleteObject(REPTILE_WEIXIN_LINK_COUNT); + if (cache != null) { + return Long.valueOf(cache); + } + return 0L; + } + +} diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/webmagic/WeiXinLinkPipeline.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/webmagic/WeiXinLinkPipeline.java new file mode 100644 index 00000000..758c251b --- /dev/null +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/webmagic/WeiXinLinkPipeline.java @@ -0,0 +1,168 @@ +package com.xjs.weixin.webmagic; + +import cn.hutool.core.date.DatePattern; +import cn.hutool.core.date.DateUtil; +import com.ruoyi.common.core.constant.HttpStatus; +import com.ruoyi.common.core.utils.StringUtils; +import com.ruoyi.common.redis.service.RedisService; +import com.ruoyi.system.api.RemoteConfigService; +import com.xjs.weixin.consts.WeiXinConst; +import lombok.extern.log4j.Log4j2; +import org.apache.http.HttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClients; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Component; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.pipeline.Pipeline; + +import javax.annotation.Resource; +import java.io.*; +import java.util.Date; +import java.util.List; +import java.util.UUID; + +import static com.xjs.weixin.consts.WeiXinConst.*; + + +/** + * 微信链接爬虫数据处理 + * + * @author xiejs + * @since 2022-03-17 + */ +@Component +@Log4j2 +public class WeiXinLinkPipeline implements Pipeline { + + @Autowired + private RedisService redisService; + @Resource + private RemoteConfigService remoteConfigService; + + + @Override + public void process(ResultItems resultItems, Task task) { + + + List linkList = resultItems.get("linkList"); + for (String link : linkList) { + + // 创建GET请求 + CloseableHttpClient httpClient = HttpClients.createDefault(); + HttpGet httpGet = null; + InputStream inputStream = null; + try { + httpGet = new HttpGet(link); + HttpResponse response = httpClient.execute(httpGet); + if (response.getStatusLine().getStatusCode() == HttpStatus.SUCCESS) { + inputStream = response.getEntity().getContent(); + + String suffix; + if (link.contains(JPEG)) { + suffix = JPEG; + } else if (link.contains(JPG)) { + suffix = JPG; + } else if (link.contains(PNG)) { + suffix = PNG; + } else { + suffix = JPG; + } + + String fileName = UUID.randomUUID() + DOT + suffix; + + this.downloadPicture(inputStream, getPath(), fileName); + } + } catch (Exception e) { + e.printStackTrace(); + } + finally { + try { + if (httpGet != null) { + httpGet.clone(); + } + } catch (CloneNotSupportedException e) { + e.printStackTrace(); + } + try { + httpClient.close(); + } catch (IOException e) { + e.printStackTrace(); + } + try { + if (inputStream != null) { + inputStream.close(); + } + } catch (IOException e) { + e.printStackTrace(); + } + } + } + + + } + + //链接url下载图片 + private void downloadPicture(InputStream inputStream, String path, String fileName) { + try { + DataInputStream dataInputStream = new DataInputStream(inputStream); + + //拼接文件路径 + String newPath=path+ File.separator+DateUtil.format(new Date(), DatePattern.NORM_MONTH_PATTERN)+File.separator + +DateUtil.format(new Date(), "dd")+"日"; + + //如果文件夹不存在则创建 + File file = new File(newPath); + if (!file.exists()) { + file.mkdirs(); + } + + String absolutePath = file.getAbsolutePath(); + String absolute = absolutePath + File.separator + fileName; + + FileOutputStream f = new FileOutputStream(absolute); + ByteArrayOutputStream out = new ByteArrayOutputStream(); + + byte[] buffer = new byte[1024]; + int length; + + while ((length = dataInputStream.read(buffer)) > 0) { + out.write(buffer, 0, length); + } + + f.write(out.toByteArray()); + dataInputStream.close(); + f.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + + /** + * 从缓存 -> 数据库 -> 内存 中获取磁盘地址 + * + * @return 地址 + */ + private String getPath() { + //磁盘路径 + String path; + //判断redis中是否存在 + Boolean hasKey = redisService.hasKey(REDIS_KEY); + if (hasKey) { + path = redisService.getCacheObject(REDIS_KEY); + } else { + String data = remoteConfigService.getConfigKeyForRPC(CONFIG_KEY).getData(); + if (StringUtils.isNotEmpty(data)) { + path = data; + } else { + path = WeiXinConst.PATH; + } + } + return path; + } + + +} diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/webmagic/WeiXinLinkProcessor.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/webmagic/WeiXinLinkProcessor.java new file mode 100644 index 00000000..0df79471 --- /dev/null +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/webmagic/WeiXinLinkProcessor.java @@ -0,0 +1,64 @@ +package com.xjs.weixin.webmagic; + +import com.ruoyi.common.redis.service.RedisService; +import lombok.extern.log4j.Log4j2; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Component; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.processor.PageProcessor; + +import java.util.List; +import java.util.concurrent.TimeUnit; + +import static com.xjs.consts.RedisConst.REPTILE_WEIXIN_LINK_COUNT; + +/** + * 微信链接爬虫 + * + * @author xiejs + * @since 2022-03-17 + */ +@Component +@Log4j2 +public class WeiXinLinkProcessor implements PageProcessor { + + @Autowired + private RedisService redisService; + + @Override + public void process(Page page) { + try { + Integer count = redisService.getCacheObject(REPTILE_WEIXIN_LINK_COUNT); + if (count == null) { + count = 0; + } + + List linkList = page.getHtml().css("section > section > img", "data-src").all(); + + page.putField("linkList",linkList); + + log.info("linkList----{}",linkList); + + count= linkList.size(); + + redisService.setCacheObject(REPTILE_WEIXIN_LINK_COUNT, count ); + } catch (Exception e) { + log.error(e.getMessage()); + } finally { + redisService.expire(REPTILE_WEIXIN_LINK_COUNT, 3, TimeUnit.HOURS); + } + } + + @Override + public Site getSite() { + return Site.me() + //.addHeader(headerKey, headerValue) + .setCharset("utf8")//设置字符编码 + .setTimeOut(2000)//设置超时时间 + .setRetrySleepTime(100)//设置重试间隔时间 + .setCycleRetryTimes(10)//设置重试次数 + .setSleepTime(1)//设置两个页面之间的间隔时间 + ; + } +}