1、36壁纸网爬虫配置参数实现页面可以自行配置

pull/254/head
xjs 4 years ago
parent 7c2e1b684d
commit f2a6a46d29

@ -0,0 +1,26 @@
package com.ruoyi.system.api;
import com.ruoyi.common.core.constant.ServiceNameConstants;
import com.ruoyi.common.core.domain.R;
import com.ruoyi.system.api.factory.RemoteConfigFallbackFactory;
import org.springframework.cloud.openfeign.FeignClient;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PathVariable;
/**
 * Feign client for the configuration endpoint of the system service.
 * <p>
 * The client is bound to {@code ServiceNameConstants.SYSTEM_SERVICE} and registers
 * {@link RemoteConfigFallbackFactory} as its fallback factory.
 *
 * @author xiejs
 * @since 2022-02-20
 */
@FeignClient(contextId = "remoteConfigService",
value = ServiceNameConstants.SYSTEM_SERVICE,
fallbackFactory = RemoteConfigFallbackFactory.class)
public interface RemoteConfigService {
/**
 * Issues {@code GET /config/configKeyForRPC/{configKey}} against the system service.
 *
 * @param configKey configuration key substituted into the request path
 * @return the remote response wrapped in {@code R}; presumably its data is the
 *         configured value for the key (NOTE(review): confirm against the
 *         system-service controller)
 */
@GetMapping(value = "/config/configKeyForRPC/{configKey}")
R<String> getConfigKeyForRPC(@PathVariable("configKey") String configKey);
}

@ -0,0 +1,26 @@
package com.ruoyi.system.api.factory;
import com.ruoyi.common.core.domain.R;
import com.ruoyi.system.api.RemoteConfigService;
import lombok.extern.log4j.Log4j2;
import org.springframework.cloud.openfeign.FallbackFactory;
import org.springframework.stereotype.Component;
/**
* @author xiejs
* @since 2022-02-20
*/
@Component
@Log4j2
public class RemoteConfigFallbackFactory implements FallbackFactory<RemoteConfigService> {
@Override
public RemoteConfigService create(Throwable cause) {
return new RemoteConfigService() {
@Override
public R<String> getConfigKeyForRPC(String configKey) {
log.error("系统配置服务调用失败:{}", cause.getMessage());
return R.fail("系统配置服务调用失败");
}
};
}
}

@ -1,4 +1,5 @@
org.springframework.boot.autoconfigure.EnableAutoConfiguration=\
com.ruoyi.system.api.factory.RemoteUserFallbackFactory,\
com.ruoyi.system.api.factory.RemoteLogFallbackFactory, \
com.ruoyi.system.api.factory.RemoteFileFallbackFactory
com.ruoyi.system.api.factory.RemoteFileFallbackFactory,\
com.ruoyi.system.api.factory.RemoteConfigFallbackFactory

@ -1,18 +1,7 @@
package com.ruoyi.system.controller;
import java.util.List;
import javax.servlet.http.HttpServletResponse;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.validation.annotation.Validated;
import org.springframework.web.bind.annotation.DeleteMapping;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.PutMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import com.ruoyi.common.core.constant.UserConstants;
import com.ruoyi.common.core.domain.R;
import com.ruoyi.common.core.utils.poi.ExcelUtil;
import com.ruoyi.common.core.web.controller.BaseController;
import com.ruoyi.common.core.web.domain.AjaxResult;
@ -23,6 +12,12 @@ import com.ruoyi.common.security.annotation.RequiresPermissions;
import com.ruoyi.common.security.utils.SecurityUtils;
import com.ruoyi.system.domain.SysConfig;
import com.ruoyi.system.service.ISysConfigService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.validation.annotation.Validated;
import org.springframework.web.bind.annotation.*;
import javax.servlet.http.HttpServletResponse;
import java.util.List;
/**
*
@ -76,6 +71,16 @@ public class SysConfigController extends BaseController
return AjaxResult.success(configService.selectConfigByKey(configKey));
}
/**
 * Looks up a configuration value by key for service-to-service (RPC) callers.
 * <p>
 * The result is wrapped in {@code R} rather than {@code AjaxResult}.
 *
 * @param configKey configuration key taken from the request path
 * @return {@code R.ok(...)} carrying whatever {@code configService.selectConfigByKey} returns
 * @author xjs
 * @since 2022-02-20
 */
@GetMapping(value = "/configKeyForRPC/{configKey}")
public R<String> getConfigKeyForRPC(@PathVariable String configKey) {
    String configValue = configService.selectConfigByKey(configKey);
    return R.ok(configValue);
}
/**
*
*/

@ -47,7 +47,7 @@ public class RedisConst {
/**
*
*/
public static final String REPTILE_COUNT= "reptile:count";
public static final String REPTILE_COUNT= "reptile:_36wallpaper.count";
//-------------------有效时间-----------------------
public static final Integer TRAN_DICT_EXPIRE = 1; //小时

@ -1,6 +1,5 @@
package com.xjs._36wallpaper.task;
import com.xjs._36wallpaper.service._36wallpaperService;
import com.xjs._36wallpaper.webmagic._36wallpaperProcessor;
import com.xjs.annotation.ReptileLog;
import lombok.extern.log4j.Log4j2;
@ -21,8 +20,6 @@ public class _36wallpaperTask {
@Autowired
private _36wallpaperProcessor wallpaperProcessor;
@Autowired
private _36wallpaperService wallpaperService;
/**
@ -31,12 +28,7 @@ public class _36wallpaperTask {
*/
@ReptileLog(name = "36壁纸网", url = _36_WALLPAPER_URL)
public Long reptileWallpaper() {
Long run = wallpaperProcessor.run();
//删除重复数据
int count = wallpaperService.deleteRepeatData();
log.info("36壁纸删除重复数据数" + count);
return run;
return wallpaperProcessor.run();
}
}

@ -1,7 +1,5 @@
package com.xjs._36wallpaper.webmagic;
import cn.hutool.core.collection.CollUtil;
import com.xjs._36wallpaper.pojo._36wallpaper;
import com.xjs._36wallpaper.service._36wallpaperService;
import lombok.extern.log4j.Log4j2;
import org.springframework.beans.factory.annotation.Autowired;
@ -10,8 +8,6 @@ import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import java.util.List;
/**
* webmagic
* @author xiejs
@ -27,11 +23,13 @@ public class _36wallpaperPipeline implements Pipeline {
@Override
public void process(ResultItems resultItems, Task task) {
List<_36wallpaper> wallpaperData = resultItems.get("_36wallpaperData");
//这种方法效率低
/*List<_36wallpaper> wallpaperData = resultItems.get("_36wallpaperData");
if (CollUtil.isNotEmpty(wallpaperData)) {
wallpaperService.saveBatch(wallpaperData, 25);
}
}*/
}
}

@ -1,8 +1,12 @@
package com.xjs._36wallpaper.webmagic;
import com.alibaba.fastjson.JSONObject;
import com.ruoyi.common.redis.service.RedisService;
import com.ruoyi.system.api.RemoteConfigService;
import com.xjs._36wallpaper.pojo._36wallpaper;
import com.xjs._36wallpaper.service._36wallpaperService;
import lombok.extern.log4j.Log4j2;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.springframework.beans.factory.annotation.Autowired;
@ -32,17 +36,51 @@ import static com.xjs.consts.ReptileConst._36_WALLPAPER_URL;
*/
@Log4j2
@Component
public class _36wallpaperProcessor implements PageProcessor {
/**
*
*/
private static boolean init = false;
private boolean init = false;
/**
*
*/
private boolean downloadImg = false;
/**
*
*/
private String path = "D:\\Dev\\WebCrawler\\36wallpaper";
/**
* rediskey
*/
public static final String REDIS_KEY = "sys_config:xjs.webmagic._36wallpaper";
/**
* key
*/
public static final String CONFIG_KEY = "xjs.webmagic._36wallpaper";
/**
* key
*/
private static final String headerKey = "User-Agent";
/**
* value
*/
private static final String headerValue = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36";
private static RemoteConfigService remoteConfigService;
/**
 * Setter-injection hook: stores the injected client in the static
 * {@code remoteConfigService} field of this class.
 * <p>
 * NOTE(review): the field is presumably static so that processor instances
 * created with {@code new} (rather than by the container) can still reach the
 * client — confirm against the callers.
 *
 * @param remoteConfigService the client to store
 */
@Autowired
public void setRemoteConfigService(RemoteConfigService remoteConfigService) {
_36wallpaperProcessor.remoteConfigService = remoteConfigService;
}
@Autowired
private _36wallpaperPipeline wallpaperPipeline;
@ -54,31 +92,46 @@ public class _36wallpaperProcessor implements PageProcessor {
}
/*private static _36wallpaperService wallpaperService;
private static _36wallpaperService wallpaperService;
@Autowired
public void setWallpaperService(_36wallpaperService wallpaperService) {
_36wallpaperProcessor.wallpaperService = wallpaperService;
}*/
}
/**
*
*
*/
private static final String path = "D:\\Dev\\WebCrawler\\36wallpaper";
/**
 * Loads the crawler settings ({@code init}, {@code downloadImg}, {@code path}) from configuration.
 * <p>
 * The JSON document is read from Redis under {@code REDIS_KEY}; when nothing is cached there,
 * it is requested once from the system service under {@code CONFIG_KEY}. If neither source
 * yields a value, or an individual property is absent, the current field value is kept.
 */
private void initParameter() {
    // Read the cache directly instead of hasKey + get: one round trip, and no window
    // in which the key can expire between the check and the read.
    String config = redisService.getCacheObject(REDIS_KEY);
    if (StringUtils.isEmpty(config)) {
        // Cache miss: ask the system service exactly once (the fallback returns no data).
        config = remoteConfigService.getConfigKeyForRPC(CONFIG_KEY).getData();
    }
    if (StringUtils.isEmpty(config)) {
        return;
    }

    JSONObject json = JSONObject.parseObject(config);
    if (json == null) {
        return;
    }

    // getBoolean returns null for a missing key; unboxing that into a primitive field would throw.
    Boolean initFlag = json.getBoolean("init");
    if (initFlag != null) {
        this.init = initFlag;
    }
    Boolean downloadFlag = json.getBoolean("downloadImg");
    if (downloadFlag != null) {
        this.downloadImg = downloadFlag;
    }
    // Keep the default directory rather than overwriting it with null/empty.
    String configuredPath = json.getString("path");
    if (StringUtils.isNotEmpty(configuredPath)) {
        this.path = configuredPath;
    }
}
private Site site = Site.me()
.addHeader(headerKey,headerValue)
.setCharset("utf8")//设置字符编码
.setTimeOut(2000)//设置超时时间
.setRetrySleepTime(200)//设置重试间隔时间
.setCycleRetryTimes(6)//设置重试次数
.setSleepTime(1)//设置两个页面之间的间隔时间
;
//解析页面
/**
*
*
* @param page
*/
@Override
public void process(Page page) {
//初始化配置 (放在此处而不放在run方法原因每次执行该方法都是创建线程拿到当前的类属性不一致)
initParameter();
//解析返回的数据page,并且把解析的结果放到ResultItems中
/*//第一种写法:css选择器
@ -136,10 +189,12 @@ public class _36wallpaperProcessor implements PageProcessor {
//保存到磁盘
if (downloadImg) {
if (link != null) {
String thisPath = path + File.separator + title;
downloadPicture(link, thisPath, pictureName + ".jpg");
}
}
//爬取图片标签
List<String> tagList = body.css(".showcontw > .showtaglistw a").all();
@ -162,10 +217,10 @@ public class _36wallpaperProcessor implements PageProcessor {
}
//持久化 --使用Pipeline实现持久化了
//wallpaperService.saveBatch(wallpapers, 25);
wallpaperService.saveBatch(wallpapers, 25);
//暂时保存到内存中后续实现Pipeline接口保存到数据库
page.putField("_36wallpaperData",wallpapers);
//暂时保存到内存中后续实现Pipeline接口保存到数据库--效率低下
//page.putField("_36wallpaperData",wallpapers);
//循环次数存入redis中
Integer count = redisService.getCacheObject(REPTILE_COUNT);
@ -185,7 +240,14 @@ public class _36wallpaperProcessor implements PageProcessor {
@Override
public Site getSite() {
return site;
return Site.me()
.addHeader(headerKey, headerValue)
.setCharset("utf8")//设置字符编码
.setTimeOut(2000)//设置超时时间
.setRetrySleepTime(100)//设置重试间隔时间
.setCycleRetryTimes(10)//设置重试次数
.setSleepTime(1)//设置两个页面之间的间隔时间
;
}
/**
@ -194,12 +256,18 @@ public class _36wallpaperProcessor implements PageProcessor {
* @return
*/
public Long run() {
Spider.create(new _36wallpaperProcessor()).addUrl(_36_WALLPAPER_URL).thread(20)
.setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(110000)))
.addPipeline(wallpaperPipeline)
.run();
//执行爬虫
Spider.create(new _36wallpaperProcessor())
.addUrl(_36_WALLPAPER_URL)//设置爬取地址
.thread(30)//设置爬取线程数
.setScheduler(new QueueScheduler()
.setDuplicateRemover(new BloomFilterDuplicateRemover(110000)))//设置url去重过滤器
//.addPipeline(wallpaperPipeline)//设置爬取之后的数据操作
.run();//执行
//删除重复数据
int count = wallpaperService.deleteRepeatData();
log.info("36壁纸删除重复数据数" + count);
//从redis中获取循环次数
Integer cache = redisService.getCacheObject(REPTILE_COUNT);

Loading…
Cancel
Save