diff --git a/ruoyi-api/ruoyi-api-system/src/main/java/com/xjs/business/webmagic/RemoteWebmagicWeiXinSouGouFeign.java b/ruoyi-api/ruoyi-api-system/src/main/java/com/xjs/business/webmagic/RemoteWebmagicWeiXinSouGouFeign.java index 768bcd25..7c83075e 100644 --- a/ruoyi-api/ruoyi-api-system/src/main/java/com/xjs/business/webmagic/RemoteWebmagicWeiXinSouGouFeign.java +++ b/ruoyi-api/ruoyi-api-system/src/main/java/com/xjs/business/webmagic/RemoteWebmagicWeiXinSouGouFeign.java @@ -18,4 +18,7 @@ public interface RemoteWebmagicWeiXinSouGouFeign { @GetMapping("/weixin_sougou/taskForPRC") R WeiXinSouGouTaskForPRC() ; + + @GetMapping("/weixin_official_accounts/taskForPRC") + R WeiXinOfficialAccountsTaskForPRC() ; } diff --git a/ruoyi-api/ruoyi-api-system/src/main/java/com/xjs/business/webmagic/factory/RemoteWebmagicWeiXinSouGouFactory.java b/ruoyi-api/ruoyi-api-system/src/main/java/com/xjs/business/webmagic/factory/RemoteWebmagicWeiXinSouGouFactory.java index 23e2c83e..c767da74 100644 --- a/ruoyi-api/ruoyi-api-system/src/main/java/com/xjs/business/webmagic/factory/RemoteWebmagicWeiXinSouGouFactory.java +++ b/ruoyi-api/ruoyi-api-system/src/main/java/com/xjs/business/webmagic/factory/RemoteWebmagicWeiXinSouGouFactory.java @@ -22,6 +22,12 @@ public class RemoteWebmagicWeiXinSouGouFactory implements FallbackFactory -
- - - - - - - - - - - - - - - - - 搜索 - 重置 - - - - - - 新增 - - - 修改 - - - 删除 - - - 导出 - - - 刷新缓存 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - {{dict.label}} - - - - - - - - -
- - - \ No newline at end of file + + + diff --git a/xjs-business/xjs-business-common/src/main/java/com/xjs/consts/RedisConst.java b/xjs-business/xjs-business-common/src/main/java/com/xjs/consts/RedisConst.java index ccc1beb6..9db69763 100644 --- a/xjs-business/xjs-business-common/src/main/java/com/xjs/consts/RedisConst.java +++ b/xjs-business/xjs-business-common/src/main/java/com/xjs/consts/RedisConst.java @@ -60,6 +60,16 @@ public class RedisConst { */ public static final String REPTILE_WEIXIN_LINK_COUNT = "bussiness:reptile:weixin.link.count"; + /** + * 爬虫记录循环次数常量信息:weixin.official + */ + public static final String REPTILE_WEIXIN_OFFICIAL_COUNT = "bussiness:reptile:weixin.official.count"; + + /** + * 爬虫获取微信公众号名称临时常量信息:temp:official_accounts:name + */ + public static final String REPTILE_WEIXIN_OFFICIAL_NAME = "temp:official_accounts:name"; + /** *爬虫记录循环次数常量信息:zol.phone */ diff --git a/xjs-business/xjs-business-common/src/main/java/com/xjs/consts/ReptileConst.java b/xjs-business/xjs-business-common/src/main/java/com/xjs/consts/ReptileConst.java index c0cf2a21..1141a9d6 100644 --- a/xjs-business/xjs-business-common/src/main/java/com/xjs/consts/ReptileConst.java +++ b/xjs-business/xjs-business-common/src/main/java/com/xjs/consts/ReptileConst.java @@ -40,6 +40,9 @@ public class ReptileConst { */ public static final String WEIXIN_SOUGOU_URL= "https://weixin.sogou.com/"; + + public static final String WEIXIN_OFFCIAL_URL= "https://weixin.sogou.com/weixin?type=1&s_from=input&query="; + /** * 中关村报价url */ diff --git a/xjs-business/xjs-business-common/src/main/java/com/xjs/utils/RandomUtils.java b/xjs-business/xjs-business-common/src/main/java/com/xjs/utils/RandomUtils.java new file mode 100644 index 00000000..bd8d0074 --- /dev/null +++ b/xjs-business/xjs-business-common/src/main/java/com/xjs/utils/RandomUtils.java @@ -0,0 +1,25 @@ +package com.xjs.utils; + +import cn.hutool.core.util.RandomUtil; + +/** + * 生成随机工具类 + * + * @author xiejs + * @since 2022-06-13 + */ +public 
/**
 * Helpers shared by the WeChat crawler pipelines: sanitising an article title so it can be
 * used as a directory name, and persisting a downloaded image to disk.
 *
 * @author xiejs
 * @since 2022-06-13
 */
public class WeiXinUtils {

    /** Characters that are replaced by {@code -} because they are illegal in Windows file names. */
    private static final String ILLEGAL_NAME_CHARS = "/\\:*?\"<>|";

    /** Size of the buffer used when streaming the download to disk. */
    private static final int BUFFER_SIZE = 8192;

    private WeiXinUtils() {
        // static utility class, not meant to be instantiated
    }

    /**
     * Turns an article title into a string that is safe to use as a directory name.
     *
     * <p>ASCII spaces are removed and each of {@code / \ : * ? " < > |} is replaced by {@code -}.
     *
     * @param title raw title, may be {@code null}
     * @return the sanitised title; an empty string when {@code title} is {@code null}
     */
    public static String filterTitle(String title) {
        if (title == null) {
            return "";
        }
        StringBuilder cleaned = new StringBuilder(title.length());
        for (int i = 0; i < title.length(); i++) {
            char ch = title.charAt(i);
            if (ch == ' ') {
                continue;
            }
            cleaned.append(ILLEGAL_NAME_CHARS.indexOf(ch) >= 0 ? '-' : ch);
        }
        return cleaned.toString();
    }

    /**
     * Streams {@code inputStream} into {@code appendPath/fileName}, creating the directory when needed.
     *
     * <p>The method is best-effort: I/O failures are reported and swallowed so that one broken
     * image does not abort the surrounding crawl. The input stream is always closed, and a
     * partially written file is removed when the copy fails.
     *
     * @param inputStream image content; closed by this method
     * @param path        base disk path (not used here, kept for signature compatibility)
     * @param fileName    name of the file to create inside {@code appendPath}
     * @param title       article title (not used here, kept for signature compatibility)
     * @param appendPath  directory the file is written to
     */
    public static void downloadPicture(InputStream inputStream, String path, String fileName, String title, String appendPath) {
        if (inputStream == null) {
            return;
        }

        File directory = new File(appendPath);
        // mkdirs() returns false when another thread created the directory first, so re-check
        if (!directory.isDirectory() && !directory.mkdirs() && !directory.isDirectory()) {
            System.err.println("Unable to create directory " + directory.getAbsolutePath());
            try {
                inputStream.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
            return;
        }

        File target = new File(directory.getAbsolutePath(), fileName);
        boolean complete = false;
        try (InputStream in = inputStream;
             OutputStream out = new FileOutputStream(target)) {
            byte[] buffer = new byte[BUFFER_SIZE];
            int read;
            // read() may legitimately return 0; only -1 signals end of stream
            while ((read = in.read(buffer)) != -1) {
                out.write(buffer, 0, read);
            }
            complete = true;
        } catch (IOException e) {
            e.printStackTrace();
        }

        if (!complete && target.exists() && !target.delete()) {
            System.err.println("Unable to remove incomplete file " + target.getAbsolutePath());
        }
    }

}
com.xjs.weixin.task.OfficialAccountsTask; +import io.swagger.annotations.Api; +import io.swagger.annotations.ApiOperation; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.web.bind.annotation.GetMapping; +import org.springframework.web.bind.annotation.RequestMapping; +import org.springframework.web.bind.annotation.RestController; + +/** + * 微信公众号controller + * @author xiejs + * @since 2022-06-13 + */ +@RestController +@RequestMapping("weixin_official_accounts") +@Api(tags = "爬虫模块-微信公众号") +public class OfficialAccountsController { + + @Autowired + private OfficialAccountsTask officialAccountsTask; + + + //----------------------远程rpc调用--------------------------- + @GetMapping("taskForPRC") + @ApiOperation("供定时任务服务RPC远程调用") + public R WeiXinOfficialAccountsTaskForPRC() { + officialAccountsTask.execute(); + return R.ok(); + } +} diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/controller/TestController.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/controller/TestController.java new file mode 100644 index 00000000..95f89ce6 --- /dev/null +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/controller/TestController.java @@ -0,0 +1,30 @@ +package com.xjs.weixin.controller; + +import com.xjs.weixin.task.OfficialAccountsTask; +import io.swagger.annotations.Api; +import io.swagger.annotations.ApiOperation; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.web.bind.annotation.GetMapping; +import org.springframework.web.bind.annotation.RequestMapping; +import org.springframework.web.bind.annotation.RestController; + +/** + * @author xiejs + * @since 2022-06-13 + */ +@RequestMapping("test") +@RestController +@Api(tags = "测试") +public class TestController { + + @Autowired + private OfficialAccountsTask officialAccountsTask; + + + @GetMapping + @ApiOperation("微信公众号") + public String test() { + officialAccountsTask.execute(); + 
return "success"; + } +} diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/task/OfficialAccountsTask.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/task/OfficialAccountsTask.java new file mode 100644 index 00000000..8255cc0b --- /dev/null +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/task/OfficialAccountsTask.java @@ -0,0 +1,118 @@ +package com.xjs.weixin.task; + +import com.ruoyi.common.core.constant.HttpStatus; +import com.ruoyi.common.core.domain.R; +import com.ruoyi.common.core.utils.StringUtils; +import com.ruoyi.common.redis.service.RedisService; +import com.ruoyi.system.api.RemoteConfigService; +import com.xjs.annotation.ReptileLog; +import com.xjs.weixin.webmagic.OfficialAccountsPipeline; +import com.xjs.weixin.webmagic.OfficialAccountsProcessor; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Component; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover; +import us.codecraft.webmagic.scheduler.QueueScheduler; + +import javax.annotation.Resource; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import static com.xjs.consts.RedisConst.REPTILE_WEIXIN_OFFICIAL_COUNT; +import static com.xjs.consts.RedisConst.REPTILE_WEIXIN_OFFICIAL_NAME; +import static com.xjs.consts.ReptileConst.WEIXIN_OFFCIAL_URL; +import static com.xjs.weixin.consts.WeiXinConst.CONFIG_KEY_OFFICIAL; +import static com.xjs.weixin.consts.WeiXinConst.REDIS_KEY_OFFICIAL; + +/** + * 微信公众号定时任务 + * + * @author xiejs + * @since 2022-06-13 + */ +@Component +@SuppressWarnings("all") +public class OfficialAccountsTask { + @Autowired + private OfficialAccountsProcessor officialAccountsProcessor; + @Autowired + private RedisService redisService; + @Autowired + private OfficialAccountsPipeline officialAccountsPipeline; + @Resource + private RemoteConfigService remoteConfigService; + + 
//解决aop自调用不生成代理对象问题 + @Autowired + private OfficialAccountsTask officialAccountsTask; + + public void execute() { + + List names = this.convert(); + for (String name : names) { + String url = WEIXIN_OFFCIAL_URL + name; + + redisService.setCacheObject(REPTILE_WEIXIN_OFFICIAL_NAME,name); + + Long aLong = officialAccountsTask.reptileWeiXinOfficialAccount(url); + + } + + } + + @ReptileLog(name = "微信公众号") + public Long reptileWeiXinOfficialAccount(String url) { + //执行爬虫 + Spider.create(officialAccountsProcessor) + .addUrl(url)//设置爬取地址 + .thread(30)//设置爬取线程数 + .setScheduler(new QueueScheduler() + .setDuplicateRemover(new BloomFilterDuplicateRemover(110000)))//设置url去重过滤器 + //.setDownloader(downloader)//设置下载器 + .addPipeline(officialAccountsPipeline)//设置爬取之后的数据操作 + .run();//同步执行 + + Integer cache = redisService.getCacheObject(REPTILE_WEIXIN_OFFICIAL_COUNT); + redisService.deleteObject(REPTILE_WEIXIN_OFFICIAL_COUNT); + if (cache != null) { + return Long.valueOf(cache); + } + return 0L; + } + + private List convert() { + String str = this.getConfigSetting(); + + if (StringUtils.isNotEmpty(str) && !str.contains(",")) { + return Arrays.asList(str); + } + + if (str.contains(",")) { + String[] split = str.split(","); + return Arrays.asList(split); + } + + + return new ArrayList<>(); + } + + /** + * 获取系统配置参数 + * + * @return str + */ + private String getConfigSetting() { + if (redisService.hasKey(REDIS_KEY_OFFICIAL)) { + return redisService.getCacheObject(REDIS_KEY_OFFICIAL); + } + + R r = remoteConfigService.getConfigKeyForRPC(CONFIG_KEY_OFFICIAL); + + if (r.getCode() == HttpStatus.SUCCESS) { + return r.getData(); + } + + return null; + } +} diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/webmagic/OfficialAccountsPipeline.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/webmagic/OfficialAccountsPipeline.java new file mode 100644 index 00000000..322a25c2 --- /dev/null +++ 
b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/webmagic/OfficialAccountsPipeline.java @@ -0,0 +1,185 @@ +package com.xjs.weixin.webmagic; + +import cn.hutool.core.collection.CollUtil; +import cn.hutool.core.date.DatePattern; +import cn.hutool.core.date.DateUtil; +import cn.hutool.core.util.RandomUtil; +import com.ruoyi.common.core.constant.HttpStatus; +import com.ruoyi.common.core.utils.StringUtils; +import com.ruoyi.common.redis.service.RedisService; +import com.ruoyi.system.api.RemoteConfigService; +import com.xjs.common.util.WeiXinUtils; +import com.xjs.utils.RandomUtils; +import com.xjs.weixin.consts.WeiXinConst; +import lombok.extern.log4j.Log4j2; +import org.apache.http.HttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClients; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Component; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.pipeline.Pipeline; + +import javax.annotation.Resource; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.util.Date; +import java.util.List; + +import static com.xjs.weixin.consts.WeiXinConst.*; + +/** + * 微信公众号爬虫数据处理 + * + * @author xiejs + * @since 2022-06-13 + */ +@Component +@Log4j2 +@SuppressWarnings("all") +public class OfficialAccountsPipeline implements Pipeline { + @Autowired + private RedisService redisService; + @Resource + private RemoteConfigService remoteConfigService; + + + @Override + public void process(ResultItems resultItems, Task task) { + + List linkList = resultItems.get("linkList"); + String title = resultItems.get("title"); + + if (CollUtil.isNotEmpty(linkList) && StringUtils.isNotEmpty(title)) { + String appendPath = this.getAppendPath(title); + + File file = new File(appendPath); + if (file.exists()) { + return; + } + + for 
(String link : linkList) { + InputStream inputStream = null; + + // 创建GET请求 + CloseableHttpClient httpClient = HttpClients.createDefault(); + HttpGet httpGet = null; + try { + httpGet = new HttpGet(link); + HttpResponse response = httpClient.execute(httpGet); + if (response.getStatusLine().getStatusCode() == HttpStatus.SUCCESS) { + inputStream = response.getEntity().getContent(); + + //文件小于30kb则不写入 + long contentLength = response.getEntity().getContentLength(); + long kb = contentLength / 1024; + if (SIZE_KB > kb) { + continue; + } + + //拼接文件后缀 + String suffix; + if (link.contains(JPEG)) { + suffix = JPEG; + } else if (link.contains(JPG)) { + suffix = JPG; + } else if (link.contains(PNG)) { + suffix = PNG; + } else if (link.contains(GIF)) { + continue; + } else { + suffix = JPG; + } + + String chars = "ABCDEFGHIZKLMNOPQRSTUVWXYZ"; + char c = chars.charAt((int) (Math.random() * 1)); + + + String fileName = RandomUtils.randomZm() + RandomUtil.randomLong(100000, 1000000) + DOT + suffix; + + this.downloadPicture(inputStream, this.getPath(), fileName, title); + } + } catch (Exception e) { + e.printStackTrace(); + } finally { + try { + if (httpGet != null) { + httpGet.clone(); + } + } catch (CloneNotSupportedException e) { + e.printStackTrace(); + } + try { + httpClient.close(); + } catch (IOException e) { + e.printStackTrace(); + } + try { + if (inputStream != null) { + inputStream.close(); + } + } catch (IOException e) { + log.error(e.getMessage()); + } + } + } + } + + + } + + /** + * 链接url下载图片 + * + * @param inputStream 输入流 + * @param path 磁盘地址 + * @param fileName 文件名称 + * @param title 标题名称 + */ + private void downloadPicture(InputStream inputStream, String path, String fileName, String title) { + + WeiXinUtils.downloadPicture(inputStream, path, fileName, title, this.getAppendPath(title)); + } + + + /** + * 获取拼接后的磁盘路径 + * + * @param title 拼接的最后的文件夹 + * @return str + */ + private String getAppendPath(String title) { + title = WeiXinUtils.filterTitle(title); + + return 
this.getPath() + File.separator + DateUtil.format(new Date(), + DatePattern.NORM_MONTH_PATTERN) + File.separator + + DateUtil.format(new Date(), "dd") + "日" + File.separator + title; + } + + + /** + * 从缓存 -> 数据库 -> 内存 中获取磁盘地址 + * + * @return 地址 + */ + private String getPath() { + //磁盘路径 + String path; + //判断redis中是否存在 + Boolean hasKey = redisService.hasKey(REDIS_KEY); + if (hasKey) { + path = redisService.getCacheObject(REDIS_KEY); + } else { + String data = remoteConfigService.getConfigKeyForRPC(CONFIG_KEY).getData(); + if (StringUtils.isNotEmpty(data)) { + path = data; + } else { + path = WeiXinConst.PATH; + } + } + return path; + } +} diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/webmagic/OfficialAccountsProcessor.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/webmagic/OfficialAccountsProcessor.java new file mode 100644 index 00000000..0a3397dc --- /dev/null +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/webmagic/OfficialAccountsProcessor.java @@ -0,0 +1,155 @@ +package com.xjs.weixin.webmagic; + +import cn.hutool.core.collection.CollUtil; +import cn.hutool.core.date.DateTime; +import cn.hutool.core.date.DateUtil; +import com.ruoyi.common.core.utils.StringUtils; +import com.ruoyi.common.redis.service.RedisService; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Component; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.selector.Selectable; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Date; +import java.util.List; +import java.util.concurrent.TimeUnit; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import static com.xjs.consts.RedisConst.REPTILE_WEIXIN_OFFICIAL_COUNT; +import static com.xjs.consts.RedisConst.REPTILE_WEIXIN_OFFICIAL_NAME; +import static 
com.xjs.consts.ReptileConst.WEIXIN_SOUGOU_URL; + +/** + * 微信公众号爬虫 + * + * @author xiejs + * @since 2022-06-13 + */ +@Component +public class OfficialAccountsProcessor implements PageProcessor { + + @Autowired + private RedisService redisService; + + /** + * 请求头key + */ + private static final String headerKey = "User-Agent"; + /** + * 请求头value + */ + private static final String headerValue = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"; + + + @Override + public void process(Page page) { + + try { + Integer count = redisService.getCacheObject(REPTILE_WEIXIN_OFFICIAL_COUNT); + if (count == null) { + count = 0; + } + + List lis = page.getHtml().css(".news-box > .news-list2 > li").nodes(); + ArrayList urls = new ArrayList<>(); + for (Selectable li : lis) { + String href = li.css("dl > dd > a", "href").get(); + + Object cacheObject = redisService.getCacheObject(REPTILE_WEIXIN_OFFICIAL_NAME); + String str = (String) cacheObject; + if (StringUtils.isNotEmpty(str)) { + String text = li.css(".txt-box > .tit > a > em", "text").get(); + String textA = li.css(".txt-box > .tit > a", "text").get(); + if (StringUtils.isNotEmpty(textA)) { + continue; + } + + if (str.equals(text)) { + + //只爬取当天的文章 + String date = li.css("dl > dd > span > script").get(); + Pattern pattern = Pattern.compile("'(.*?)'"); + Matcher matcher = pattern.matcher(date); + while (matcher.find()) { + //拿到时间戳 + String word = matcher.group(1); + + DateTime dateTime = DateUtil.date(Long.parseLong(word) * 1000); + String dateStr = dateTime.toDateStr(); + + String nowDateStr = DateUtil.formatDate(new Date()); + + if (dateStr.equals(nowDateStr)) { + urls.add(WEIXIN_SOUGOU_URL + href); + } + } + } + } + redisService.deleteObject(REPTILE_WEIXIN_OFFICIAL_NAME); + + } + + page.addTargetRequests(urls); + + String js = page.getHtml().get(); + + if (js.contains("window.location.replace(url)")) { + String function = js.substring(js.indexOf("{") + 1, 
js.indexOf("}")); + //System.out.println("function="+function); + + //正则匹配 ' ' 里面的内容 + Pattern pattern = Pattern.compile("'(.*?)'"); + Matcher matcher = pattern.matcher(function); + StringBuilder stringBuilder = new StringBuilder(); + while (matcher.find()) { + String word = matcher.group(1); + stringBuilder.append(word); + } + page.addTargetRequests(Collections.singletonList(stringBuilder.toString())); + } + + //获取图片url + List linkList = page.getHtml().css("img", "data-src").all(); + //去空 + linkList.removeIf(StringUtils::isBlank); + + //获取标题 + String title = page.getHtml().css("#activity-name", "text").get(); + + if (StringUtils.isNotEmpty(title)) { + page.putField("title", title); + } + if (CollUtil.isNotEmpty(linkList)) { + page.putField("linkList", linkList); + } + + count = linkList.size(); + + redisService.setCacheObject(REPTILE_WEIXIN_OFFICIAL_COUNT, count); + } catch (Exception e) { + e.printStackTrace(); + } finally { + redisService.expire(REPTILE_WEIXIN_OFFICIAL_COUNT, 3, TimeUnit.HOURS); + redisService.expire(REPTILE_WEIXIN_OFFICIAL_NAME, 3, TimeUnit.HOURS); + } + + } + + @Override + public Site getSite() { + return Site.me() + //.addHeader(headerKey, headerValue) + .addHeader(headerKey, headerValue) + .setCharset("utf8")//设置字符编码 + .setTimeOut(2000)//设置超时时间 + .setRetrySleepTime(100)//设置重试间隔时间 + .setCycleRetryTimes(10)//设置重试次数 + .setSleepTime(1)//设置两个页面之间的间隔时间 + ; + } +} diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/webmagic/WeiXinLinkPipeline.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/webmagic/WeiXinLinkPipeline.java index 0322ec82..9c7a1446 100644 --- a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/webmagic/WeiXinLinkPipeline.java +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/weixin/webmagic/WeiXinLinkPipeline.java @@ -6,6 +6,7 @@ import com.ruoyi.common.core.constant.HttpStatus; import com.ruoyi.common.core.utils.StringUtils; import 
com.ruoyi.common.redis.service.RedisService; import com.ruoyi.system.api.RemoteConfigService; +import com.xjs.common.util.WeiXinUtils; import com.xjs.weixin.consts.WeiXinConst; import lombok.extern.log4j.Log4j2; import org.apache.http.HttpResponse; @@ -43,9 +44,6 @@ public class WeiXinLinkPipeline implements Pipeline { private RemoteConfigService remoteConfigService; - - - @Override public void process(ResultItems resultItems, Task task) { @@ -134,79 +132,18 @@ public class WeiXinLinkPipeline implements Pipeline { */ private void downloadPicture(InputStream inputStream, String path, String fileName, String title) { - try { - DataInputStream dataInputStream = new DataInputStream(inputStream); - - //拼接文件路径 - String appendPath = this.getAppendPath(title); - - //如果文件夹不存在则创建 - File file = new File(appendPath); - - if (!file.exists()) { - boolean mkdirs = file.mkdirs(); - } - - String absolutePath = file.getAbsolutePath(); - String absolute = absolutePath + File.separator + fileName; - - FileOutputStream f = new FileOutputStream(absolute); - - ByteArrayOutputStream out = new ByteArrayOutputStream(); - - byte[] bf = new byte[1024]; - int length; - - while ((length = dataInputStream.read(bf)) > 0) { - out.write(bf, 0, length); - } - - f.write(out.toByteArray()); - dataInputStream.close(); - f.close(); - } catch (IOException e) { - e.printStackTrace(); - } + WeiXinUtils.downloadPicture(inputStream, path, fileName, title, this.getAppendPath(title)); } /** * 获取拼接后的磁盘路径 + * * @param title 拼接的最后的文件夹 * @return str */ private String getAppendPath(String title) { - //过滤title字段 - title = title.replace(" ", ""); - //替换\ 防止报错 - if (title.contains("/")) { - title = title.replace("/", "-"); - } - if (title.contains("\\")) { - title = title.replace("\\", "-"); - } - if (title.contains(":")) { - title = title.replace(":", "-"); - } - if (title.contains("*")) { - title = title.replace("*", "-"); - } - if (title.contains("?")) { - title = title.replace("?", "-"); - } - if 
(title.contains("\"")) { - title = title.replace("\"", "-"); - } - if (title.contains("<")) { - title = title.replace("<", "-"); - } - if (title.contains(">")) { - title = title.replace(">", "-"); - } - if (title.contains("|")) { - title = title.replace("|", "-"); - } - + title = WeiXinUtils.filterTitle(title); return this.getPath() + File.separator + DateUtil.format(new Date(), DatePattern.NORM_MONTH_PATTERN) + File.separator