1、爬虫实现爬取微信文章图片功能,保存到磁盘

pull/254/head
xjs 4 years ago
parent 366685379a
commit 59400b2e3e

@ -2,6 +2,7 @@ package com.xjs.consts;
/** /**
* redis key * redis key
*
* @author xiejs * @author xiejs
* @since 2021-12-30 * @since 2021-12-30
*/ */
@ -45,12 +46,20 @@ public class RedisConst {
public static final String FORECAST_WEATHER = "weather:forecast"; public static final String FORECAST_WEATHER = "weather:forecast";
/** /**
* * _36wallpaper
*/ */
public static final String REPTILE_36_WALLPAPER_COUNT= "reptile:_36wallpaper.count"; public static final String REPTILE_36_WALLPAPER_COUNT = "reptile:_36wallpaper.count";
/**
* weixin.sougou
*/
public static final String REPTILE_WEIXIN_SOUGOU_COUNT = "reptile:weixin.sougou.count";
public static final String REPTILE_WEIXIN_SOUGOU_COUNT= "reptile:weixin.sougou.count"; /**
* weixin.link
*/
public static final String REPTILE_WEIXIN_LINK_COUNT = "reptile:weixin.link.count";
;
//-------------------有效时间----------------------- //-------------------有效时间-----------------------
public static final Integer TRAN_DICT_EXPIRE = 1; //小时 public static final Integer TRAN_DICT_EXPIRE = 1; //小时

@ -92,6 +92,9 @@ public class ApiLogAspect {
if (obj instanceof String) { if (obj instanceof String) {
if (StringUtils.isNotEmpty(String.valueOf(obj))) { if (StringUtils.isNotEmpty(String.valueOf(obj))) {
this.warning(between, joinPoint); this.warning(between, joinPoint);
}else {
this.demoteHandle(joinPoint);
log.info("降级!调用接口耗费时间:{}ms", between);
} }
} }

@ -0,0 +1,34 @@
package com.xjs.weixin.consts;
/**
 * Constants used by the WeChat article picture crawler.
 *
 * <p>This class only holds constants and cannot be instantiated.
 *
 * @author xiejs
 * @since 2022-03-17
 */
public final class WeiXinConst {

    /**
     * Default directory that downloaded pictures are written to; used as the fallback when
     * neither Redis nor the remote configuration supplies a path.
     */
    public static final String PATH = "D:\\Dev\\WebCrawler\\Wechat";

    /**
     * Redis key under which the configured storage path is looked up.
     */
    public static final String REDIS_KEY = "sys_config:xjs.webmagic.wechatPicture";

    /**
     * Configuration key passed to the remote configuration service to look up the storage path.
     */
    public static final String CONFIG_KEY = "xjs.webmagic.wechatPicture";

    /** File suffix (and URL marker) for JPEG pictures. */
    public static final String JPEG = "jpeg";

    /** File suffix (and URL marker) for JPG pictures. */
    public static final String JPG = "jpg";

    /** File suffix (and URL marker) for PNG pictures. */
    public static final String PNG = "png";

    /** Separator placed between a file name and its suffix. */
    public static final String DOT = ".";

    /** Constants holder; not meant to be instantiated. */
    private WeiXinConst() {
    }
}

@ -0,0 +1,37 @@
package com.xjs.weixin.controller;
import com.ruoyi.common.core.web.controller.BaseController;
import com.ruoyi.common.core.web.domain.AjaxResult;
import com.ruoyi.common.security.annotation.RequiresPermissions;
import com.xjs.weixin.service.WeiXinLinkService;
import io.swagger.annotations.Api;
import io.swagger.annotations.ApiOperation;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
/**
 * REST controller of the crawler module for WeChat article links.
 *
 * @author xiejs
 * @since 2022-03-17
 */
@RestController
@RequestMapping("weixin_link")
@Api(tags = "爬虫模块-微信链接")
public class WeiXinLinkController extends BaseController {

    @Autowired
    private WeiXinLinkService linkService;

    /**
     * Triggers a crawl of the pictures referenced by the given article link.
     *
     * <p>NOTE(review): {@code link} is forwarded to the crawler as received; consider
     * validating scheme/host before fetching — confirm whether this is checked elsewhere.
     *
     * @param link article URL supplied by the caller
     * @return the result built by {@code toAjax} from the service's boolean outcome
     */
    @RequiresPermissions("webmagic:weixinlink:get")
    @GetMapping("/getPicture")
    @ApiOperation("获取文章图片")
    public AjaxResult getPicture(@RequestParam("link") String link) {
        return toAjax(linkService.getPicture(link));
    }
}

@ -3,6 +3,7 @@ package com.xjs.weixin.controller;
import com.ejlchina.searcher.SearchResult; import com.ejlchina.searcher.SearchResult;
import com.ruoyi.common.core.domain.R; import com.ruoyi.common.core.domain.R;
import com.ruoyi.common.core.utils.poi.ExcelUtil; import com.ruoyi.common.core.utils.poi.ExcelUtil;
import com.ruoyi.common.core.web.controller.BaseController;
import com.ruoyi.common.core.web.domain.AjaxResult; import com.ruoyi.common.core.web.domain.AjaxResult;
import com.ruoyi.common.core.web.page.PageDomain; import com.ruoyi.common.core.web.page.PageDomain;
import com.ruoyi.common.core.web.page.TableSupport; import com.ruoyi.common.core.web.page.TableSupport;
@ -10,7 +11,6 @@ import com.ruoyi.common.log.annotation.Log;
import com.ruoyi.common.log.enums.BusinessType; import com.ruoyi.common.log.enums.BusinessType;
import com.ruoyi.common.security.annotation.RequiresPermissions; import com.ruoyi.common.security.annotation.RequiresPermissions;
import com.xjs.validation.group.SelectGroup; import com.xjs.validation.group.SelectGroup;
import com.xjs.web.MyBaseController;
import com.xjs.weixin.pojo.WeiXinSouGou; import com.xjs.weixin.pojo.WeiXinSouGou;
import com.xjs.weixin.service.WeiXinSouGouService; import com.xjs.weixin.service.WeiXinSouGouService;
import com.xjs.weixin.task.WeiXinSouGouTask; import com.xjs.weixin.task.WeiXinSouGouTask;
@ -32,7 +32,7 @@ import java.util.List;
@RestController @RestController
@RequestMapping("weixin_sougou") @RequestMapping("weixin_sougou")
@Api(tags = "爬虫模块-微信搜狗") @Api(tags = "爬虫模块-微信搜狗")
public class WeiXinSouGouController extends MyBaseController { public class WeiXinSouGouController extends BaseController {
@Autowired @Autowired
private WeiXinSouGouTask weiXinSouGouTask; private WeiXinSouGouTask weiXinSouGouTask;
@ -45,6 +45,7 @@ public class WeiXinSouGouController extends MyBaseController {
*/ */
@RequiresPermissions("webmagic:weixinsougou:list") @RequiresPermissions("webmagic:weixinsougou:list")
@GetMapping("/list") @GetMapping("/list")
@ApiOperation("查询爬虫微信搜狗搜索列表")
public AjaxResult list(@Validated({SelectGroup.class}) WeiXinSouGou weiXinSouGou) { public AjaxResult list(@Validated({SelectGroup.class}) WeiXinSouGou weiXinSouGou) {
//startPage(); //startPage();
PageDomain pageDomain = TableSupport.buildPageRequest(); PageDomain pageDomain = TableSupport.buildPageRequest();
@ -71,6 +72,7 @@ public class WeiXinSouGouController extends MyBaseController {
@RequiresPermissions("webmagic:weixinsougou:export") @RequiresPermissions("webmagic:weixinsougou:export")
@Log(title = "微信搜狗", businessType = BusinessType.EXPORT) @Log(title = "微信搜狗", businessType = BusinessType.EXPORT)
@PostMapping("/export") @PostMapping("/export")
@ApiOperation("导出爬虫微信搜狗搜索列表")
public void export(HttpServletResponse response, WeiXinSouGou weiXinSouGou) { public void export(HttpServletResponse response, WeiXinSouGou weiXinSouGou) {
List<WeiXinSouGou> list = weiXinSouGouService.selectWeiXinSouGouList(weiXinSouGou); List<WeiXinSouGou> list = weiXinSouGouService.selectWeiXinSouGouList(weiXinSouGou);
ExcelUtil<WeiXinSouGou> util = new ExcelUtil<>(WeiXinSouGou.class); ExcelUtil<WeiXinSouGou> util = new ExcelUtil<>(WeiXinSouGou.class);
@ -81,8 +83,9 @@ public class WeiXinSouGouController extends MyBaseController {
* *
*/ */
@RequiresPermissions("webmagic:weixinsougou:remove") @RequiresPermissions("webmagic:weixinsougou:remove")
@Log(title = "爬虫微信搜狗搜索", businessType = BusinessType.DELETE) @Log(title = "微信搜狗", businessType = BusinessType.DELETE)
@DeleteMapping("/{ids}") @DeleteMapping("/{ids}")
@ApiOperation("爬虫微信搜狗搜索")
public AjaxResult remove(@PathVariable Long[] ids) { public AjaxResult remove(@PathVariable Long[] ids) {
return toAjax(weiXinSouGouService.deleteWeiXinSouGouByIds(ids)); return toAjax(weiXinSouGouService.deleteWeiXinSouGouByIds(ids));
} }

@ -0,0 +1,16 @@
package com.xjs.weixin.service;
/**
 * Service for crawling the pictures of a WeChat article link.
 *
 * @author xiejs
 * @since 2022-03-17
 */
public interface WeiXinLinkService {
/**
 * Crawls the pictures referenced by the given article link.
 *
 * @param link URL of the article handed to the crawler
 * @return outcome of the crawl; in {@code WeiXinLinkServiceImpl} this is {@code true} when
 *         the crawl task reports a non-zero picture link count
 */
Boolean getPicture(String link);
}

@ -0,0 +1,25 @@
package com.xjs.weixin.service.impl;
import com.xjs.weixin.service.WeiXinLinkService;
import com.xjs.weixin.task.WeiXinLinkTask;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
/**
 * Default {@link WeiXinLinkService} that delegates the crawl to {@link WeiXinLinkTask}.
 *
 * @author xiejs
 * @since 2022-03-17
 */
@Service
public class WeiXinLinkServiceImpl implements WeiXinLinkService {

    @Autowired
    private WeiXinLinkTask weiXinLinkTask;

    /**
     * Runs the crawl task for the link and reports whether it counted anything.
     *
     * @param link URL of the article handed to the crawl task
     * @return {@code true} when the count returned by the task is not zero
     */
    @Override
    public Boolean getPicture(String link) {
        final long crawled = weiXinLinkTask.reptileWeiXinLink(link);
        return crawled != 0L;
    }
}

@ -0,0 +1,52 @@
package com.xjs.weixin.task;
import com.ruoyi.common.redis.service.RedisService;
import com.xjs.annotation.ReptileLog;
import com.xjs.weixin.webmagic.WeiXinLinkPipeline;
import com.xjs.weixin.webmagic.WeiXinLinkProcessor;
import lombok.extern.log4j.Log4j2;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import static com.xjs.consts.RedisConst.REPTILE_WEIXIN_LINK_COUNT;
/**
 * Crawl task that runs a WebMagic spider against a single WeChat article link.
 *
 * @author xiejs
 * @since 2022-03-17
 */
@Component
@Log4j2
public class WeiXinLinkTask {

    @Autowired
    private WeiXinLinkProcessor weiXinLinkProcessor;
    @Autowired
    private WeiXinLinkPipeline weiXinLinkPipeline;
    @Autowired
    private RedisService redisService;

    /**
     * Crawls the given link and returns the count the processor stored in Redis.
     *
     * <p>The spider runs synchronously; afterwards the counter key is read and then deleted.
     *
     * @param link start URL of the crawl
     * @return the cached count as a {@code Long}, or {@code 0} when no count was cached
     */
    @ReptileLog(name = "微信链接", url = "###")
    public Long reptileWeiXinLink(String link) {
        // Scheduler with a bloom-filter based URL de-duplicator.
        QueueScheduler scheduler = new QueueScheduler();
        scheduler.setDuplicateRemover(new BloomFilterDuplicateRemover(110000));

        // Configure the spider in the same order as before: URL, threads, scheduler, pipeline.
        Spider spider = Spider.create(weiXinLinkProcessor);
        spider.addUrl(link);
        spider.thread(30);
        spider.setScheduler(scheduler);
        spider.addPipeline(weiXinLinkPipeline);
        //spider.setDownloader(downloader)
        spider.run();

        // Read the counter and clear it so it does not carry over to a later run.
        Integer found = redisService.getCacheObject(REPTILE_WEIXIN_LINK_COUNT);
        redisService.deleteObject(REPTILE_WEIXIN_LINK_COUNT);
        return found == null ? 0L : found.longValue();
    }
}

@ -0,0 +1,168 @@
package com.xjs.weixin.webmagic;
import cn.hutool.core.date.DatePattern;
import cn.hutool.core.date.DateUtil;
import com.ruoyi.common.core.constant.HttpStatus;
import com.ruoyi.common.core.utils.StringUtils;
import com.ruoyi.common.redis.service.RedisService;
import com.ruoyi.system.api.RemoteConfigService;
import com.xjs.weixin.consts.WeiXinConst;
import lombok.extern.log4j.Log4j2;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import javax.annotation.Resource;
import java.io.*;
import java.util.Date;
import java.util.List;
import java.util.UUID;
import static com.xjs.weixin.consts.WeiXinConst.*;
/**
*
*
* @author xiejs
* @since 2022-03-17
*/
@Component
@Log4j2
public class WeiXinLinkPipeline implements Pipeline {
@Autowired
private RedisService redisService;
@Resource
private RemoteConfigService remoteConfigService;
@Override
public void process(ResultItems resultItems, Task task) {
List<String> linkList = resultItems.get("linkList");
for (String link : linkList) {
// 创建GET请求
CloseableHttpClient httpClient = HttpClients.createDefault();
HttpGet httpGet = null;
InputStream inputStream = null;
try {
httpGet = new HttpGet(link);
HttpResponse response = httpClient.execute(httpGet);
if (response.getStatusLine().getStatusCode() == HttpStatus.SUCCESS) {
inputStream = response.getEntity().getContent();
String suffix;
if (link.contains(JPEG)) {
suffix = JPEG;
} else if (link.contains(JPG)) {
suffix = JPG;
} else if (link.contains(PNG)) {
suffix = PNG;
} else {
suffix = JPG;
}
String fileName = UUID.randomUUID() + DOT + suffix;
this.downloadPicture(inputStream, getPath(), fileName);
}
} catch (Exception e) {
e.printStackTrace();
}
finally {
try {
if (httpGet != null) {
httpGet.clone();
}
} catch (CloneNotSupportedException e) {
e.printStackTrace();
}
try {
httpClient.close();
} catch (IOException e) {
e.printStackTrace();
}
try {
if (inputStream != null) {
inputStream.close();
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
//链接url下载图片
private void downloadPicture(InputStream inputStream, String path, String fileName) {
try {
DataInputStream dataInputStream = new DataInputStream(inputStream);
//拼接文件路径
String newPath=path+ File.separator+DateUtil.format(new Date(), DatePattern.NORM_MONTH_PATTERN)+File.separator
+DateUtil.format(new Date(), "dd")+"日";
//如果文件夹不存在则创建
File file = new File(newPath);
if (!file.exists()) {
file.mkdirs();
}
String absolutePath = file.getAbsolutePath();
String absolute = absolutePath + File.separator + fileName;
FileOutputStream f = new FileOutputStream(absolute);
ByteArrayOutputStream out = new ByteArrayOutputStream();
byte[] buffer = new byte[1024];
int length;
while ((length = dataInputStream.read(buffer)) > 0) {
out.write(buffer, 0, length);
}
f.write(out.toByteArray());
dataInputStream.close();
f.close();
} catch (IOException e) {
e.printStackTrace();
}
}
/**
* -> ->
*
* @return
*/
private String getPath() {
//磁盘路径
String path;
//判断redis中是否存在
Boolean hasKey = redisService.hasKey(REDIS_KEY);
if (hasKey) {
path = redisService.getCacheObject(REDIS_KEY);
} else {
String data = remoteConfigService.getConfigKeyForRPC(CONFIG_KEY).getData();
if (StringUtils.isNotEmpty(data)) {
path = data;
} else {
path = WeiXinConst.PATH;
}
}
return path;
}
}

@ -0,0 +1,64 @@
package com.xjs.weixin.webmagic;
import com.ruoyi.common.redis.service.RedisService;
import lombok.extern.log4j.Log4j2;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
import java.util.concurrent.TimeUnit;
import static com.xjs.consts.RedisConst.REPTILE_WEIXIN_LINK_COUNT;
/**
*
*
* @author xiejs
* @since 2022-03-17
*/
@Component
@Log4j2
public class WeiXinLinkProcessor implements PageProcessor {
@Autowired
private RedisService redisService;
@Override
public void process(Page page) {
try {
Integer count = redisService.getCacheObject(REPTILE_WEIXIN_LINK_COUNT);
if (count == null) {
count = 0;
}
List<String> linkList = page.getHtml().css("section > section > img", "data-src").all();
page.putField("linkList",linkList);
log.info("linkList----{}",linkList);
count= linkList.size();
redisService.setCacheObject(REPTILE_WEIXIN_LINK_COUNT, count );
} catch (Exception e) {
log.error(e.getMessage());
} finally {
redisService.expire(REPTILE_WEIXIN_LINK_COUNT, 3, TimeUnit.HOURS);
}
}
@Override
public Site getSite() {
return Site.me()
//.addHeader(headerKey, headerValue)
.setCharset("utf8")//设置字符编码
.setTimeOut(2000)//设置超时时间
.setRetrySleepTime(100)//设置重试间隔时间
.setCycleRetryTimes(10)//设置重试次数
.setSleepTime(1)//设置两个页面之间的间隔时间
;
}
}
Loading…
Cancel
Save