diff --git a/xjs-business/xjs-business-common/src/main/java/com/xjs/consts/RedisConst.java b/xjs-business/xjs-business-common/src/main/java/com/xjs/consts/RedisConst.java
index 7290e775..b8e7505c 100644
--- a/xjs-business/xjs-business-common/src/main/java/com/xjs/consts/RedisConst.java
+++ b/xjs-business/xjs-business-common/src/main/java/com/xjs/consts/RedisConst.java
@@ -44,6 +44,10 @@ public class RedisConst {
      */
     public static final String FORECAST_WEATHER = "weather:forecast";
 
+    /**
+     * Redis key under which the crawler records its loop count
+     */
+    public static final String REPTILE_COUNT= "reptile:count";
 
     //-------------------有效时间-----------------------
     public static final Integer TRAN_DICT_EXPIRE = 1; //小时
diff --git a/xjs-business/xjs-business-common/src/main/java/com/xjs/consts/ReptileConst.java b/xjs-business/xjs-business-common/src/main/java/com/xjs/consts/ReptileConst.java
index 5fcaa7f4..558f208f 100644
--- a/xjs-business/xjs-business-common/src/main/java/com/xjs/consts/ReptileConst.java
+++ b/xjs-business/xjs-business-common/src/main/java/com/xjs/consts/ReptileConst.java
@@ -31,6 +31,9 @@ public class ReptileConst {
 
     public static final String BOSS_JOB_URL= "https://www.zhipin.com";
 
+    public static final String _36_WALLPAPER_URL= "https://www.3gbizhi.com/";
+
+
 
 
 
diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/_36wallpaper/controller/_36wallpaperController.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/_36wallpaper/controller/_36wallpaperController.java
new file mode 100644
index 00000000..c26464c6
--- /dev/null
+++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/_36wallpaper/controller/_36wallpaperController.java
@@ -0,0 +1,33 @@
+package com.xjs._36wallpaper.controller;
+
+import com.ruoyi.common.core.domain.R;
+import com.xjs._36wallpaper.task._36wallpaperTask;
+import io.swagger.annotations.Api;
+import io.swagger.annotations.ApiOperation;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.web.bind.annotation.GetMapping;
+import org.springframework.web.bind.annotation.RequestMapping;
+import org.springframework.web.bind.annotation.RestController;
+
+/**
+ * Controller of the 36 wallpaper site crawler; exposes the crawl task over HTTP under "_36wallpaper"
+ * @author xiejs
+ * @since 2022-02-20
+ */
+@RestController
+@RequestMapping("_36wallpaper")
+@Api(tags = "爬虫模块-36壁纸网")
+public class _36wallpaperController {
+
+    @Autowired
+    private _36wallpaperTask wallpaperTask;
+
+
+    //----------------------remote RPC calls---------------------------
+    @GetMapping("taskForPRC")
+    @ApiOperation("供定时任务服务RPC远程调用")
+    public R _36wallpaperControllerTaskForPRC() {
+        Long count = wallpaperTask.reptileWallpaper(); // value returned by the crawl task, passed through to the caller
+        return R.ok(count);
+    }
+}
diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/_36wallpaper/mapper/_36wallpaperMapper.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/_36wallpaper/mapper/_36wallpaperMapper.java
new file mode 100644
index 00000000..3ec311b4
--- /dev/null
+++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/_36wallpaper/mapper/_36wallpaperMapper.java
@@ -0,0 +1,18 @@
+package com.xjs._36wallpaper.mapper;
+
+import com.baomidou.mybatisplus.core.mapper.BaseMapper;
+import com.xjs._36wallpaper.pojo._36wallpaper;
+
+/**
+ * MyBatis-Plus mapper for the 36 wallpaper site pictures
+ * @author xiejs
+ * @since 2022-02-20
+ */
+public interface _36wallpaperMapper extends BaseMapper<_36wallpaper> {
+
+    /**
+     * Deletes duplicate rows
+     * @return int result of the delete statement (presumably the number of rows removed; confirm against the mapper XML)
+     */
+    int deleteRepeatData();
+}
diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/_36wallpaper/pojo/_36wallpaper.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/_36wallpaper/pojo/_36wallpaper.java
new file mode 100644
index 00000000..abd95259
--- /dev/null
+++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/_36wallpaper/pojo/_36wallpaper.java
@@ -0,0 +1,48 @@
+package com.xjs._36wallpaper.pojo;
+
+import com.baomidou.mybatisplus.annotation.FieldFill;
+import com.baomidou.mybatisplus.annotation.TableField;
+import com.baomidou.mybatisplus.annotation.TableId;
+import com.baomidou.mybatisplus.annotation.TableName;
+import com.ruoyi.common.core.annotation.Excel;
+import lombok.Data;
+
+import java.io.Serializable;
+import java.util.Date;
+
+/**
+ * Entity for one picture crawled from the 36 wallpaper site, mapped to table webmagic_36wallpaper
+ * @author xiejs
+ * @since 2022-02-20
+ */
+@Data
+@TableName("webmagic_36wallpaper")
+public class _36wallpaper implements Serializable {
+
+    private static final long serialVersionUID = 1L;
+
+    /** Primary key */
+    @TableId("id")
+    private Long id;
+
+    /** Address the picture is stored at */
+    @Excel(name = "照片存放地址")
+    private String pictureUrl;
+
+    /** Picture name */
+    @Excel(name = "照片名称")
+    private String pictureName;
+
+    /** Picture type */
+    @Excel(name = "照片类型")
+    private String type;
+
+    /** Picture tags (several are separated by ,) */
+    @Excel(name = "照片标签(多个用 , 分割)")
+    private String label;
+
+    /** Creation time, filled on insert */
+    @Excel(name = "创建时间",dateFormat = "yyyy-MM-dd HH:mm:ss")
+    @TableField(fill = FieldFill.INSERT)
+    private Date createTime;
+}
diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/_36wallpaper/service/_36wallpaperService.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/_36wallpaper/service/_36wallpaperService.java
new file mode 100644
index 00000000..b9dbdc67
--- /dev/null
+++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/_36wallpaper/service/_36wallpaperService.java
@@ -0,0 +1,18 @@
+package com.xjs._36wallpaper.service;
+
+import com.baomidou.mybatisplus.extension.service.IService;
+import com.xjs._36wallpaper.pojo._36wallpaper;
+
+/**
+ * Service interface for the 36 wallpaper site pictures
+ * @author xiejs
+ * @since 2022-02-20
+ */
+public interface _36wallpaperService extends IService<_36wallpaper> {
+
+    /**
+     * Deletes duplicate rows
+     * @return int result of the delete (presumably the number of rows removed; confirm against the mapper XML)
+     */
+    int deleteRepeatData();
+}
diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/_36wallpaper/service/impl/_36wallpaperServiceImpl.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/_36wallpaper/service/impl/_36wallpaperServiceImpl.java
new file mode 100644
index 00000000..c33e3cc5
--- /dev/null
+++ 
b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/_36wallpaper/service/impl/_36wallpaperServiceImpl.java
@@ -0,0 +1,27 @@
+package com.xjs._36wallpaper.service.impl;
+
+import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
+import com.xjs._36wallpaper.mapper._36wallpaperMapper;
+import com.xjs._36wallpaper.pojo._36wallpaper;
+import com.xjs._36wallpaper.service._36wallpaperService;
+import org.springframework.stereotype.Service;
+
+import javax.annotation.Resource;
+
+/**
+ * Service implementation for the 36 wallpaper site pictures; delegates to the mapper
+ * @author xiejs
+ * @since 2022-02-20
+ */
+@Service
+public class _36wallpaperServiceImpl extends ServiceImpl<_36wallpaperMapper, _36wallpaper> implements _36wallpaperService {
+
+
+    @Resource
+    private _36wallpaperMapper wallpaperMapper; // NOTE(review): ServiceImpl presumably already exposes this mapper as baseMapper - confirm whether the extra injection is needed
+
+    @Override
+    public int deleteRepeatData() {
+        return wallpaperMapper.deleteRepeatData();
+    }
+}
diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/_36wallpaper/task/_36wallpaperTask.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/_36wallpaper/task/_36wallpaperTask.java
new file mode 100644
index 00000000..84fd2174
--- /dev/null
+++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/_36wallpaper/task/_36wallpaperTask.java
@@ -0,0 +1,33 @@
+package com.xjs._36wallpaper.task;
+
+import com.xjs._36wallpaper.webmagic._36wallpaperProcessor;
+import com.xjs.annotation.ReptileLog;
+import lombok.extern.log4j.Log4j2;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Component;
+
+import static com.xjs.consts.ReptileConst._36_WALLPAPER_URL;
+
+/**
+ * Crawl task for the 36 wallpaper site; a thin wrapper around the processor's run()
+ * @author xiejs
+ * @since 2022-02-20
+ */
+@Component
+@Log4j2
+public class _36wallpaperTask {
+
+    @Autowired
+    private _36wallpaperProcessor wallpaperProcessor;
+
+
+    /**
+     * Entry point called by the scheduled-task service
+     * @return loop count reported by the processor
+     */
+    @ReptileLog(name = "36壁纸网", url = _36_WALLPAPER_URL)
+    public Long reptileWallpaper() {
+        return wallpaperProcessor.run();
+    }
+
+}
diff --git 
a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/_36wallpaper/webmagic/_36wallpaperProcessor.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/_36wallpaper/webmagic/_36wallpaperProcessor.java
index bef3adc9..759377f9 100644
--- a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/_36wallpaper/webmagic/_36wallpaperProcessor.java
+++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/_36wallpaper/webmagic/_36wallpaperProcessor.java
@@ -1,9 +1,29 @@
 package com.xjs._36wallpaper.webmagic;
 
+import com.ruoyi.common.redis.service.RedisService;
+import com.xjs._36wallpaper.pojo._36wallpaper;
+import com.xjs._36wallpaper.service._36wallpaperService;
+import lombok.extern.log4j.Log4j2;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Component;
 import us.codecraft.webmagic.Page;
 import us.codecraft.webmagic.Site;
 import us.codecraft.webmagic.Spider;
 import us.codecraft.webmagic.processor.PageProcessor;
+import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
+import us.codecraft.webmagic.scheduler.QueueScheduler;
+import us.codecraft.webmagic.selector.Selectable;
+
+import java.io.*;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+
+import static com.xjs.consts.RedisConst.REPTILE_COUNT;
+import static com.xjs.consts.ReptileConst._36_WALLPAPER_URL;
 
 /**
  * 36壁纸网站 爬虫处理
@@ -11,23 +31,50 @@ import us.codecraft.webmagic.processor.PageProcessor;
  * @author xiejs
  * @since 2022-02-19
  */
+@Log4j2
+@Component
 public class _36wallpaperProcessor implements PageProcessor {
 
-    private static final String URL = "https://www.3gbizhi.com/";
+    /**
+     * Whether to crawl the whole site; when true, pagination links are followed as well
+     */
+    private static boolean init = false;
+
+    /**
+     * Held statically because the spider is driven by instances created with {@code new},
+     * which Spring does not inject; the setters below copy the beans into the statics.
+     */
+    private static RedisService redisService;
 
-    private Site site = Site.me()
-            .setCharset("uft8")//设置字符编码
-            .setTimeOut(10000)//设置超时时间
-            .setRetrySleepTime(2000)//设置重试间隔时间
-            .setCycleRetryTimes(3)//设置重试次数
-            .setSleepTime(10)//设置两个页面之间的间隔时间
-            ;
-
-    public void setSite(Site site) {
-        this.site = site;
+    @Autowired
+    public void setRedisService(RedisService redisService) {
+        _36wallpaperProcessor.redisService = redisService;
     }
 
 
+
+    /** Persistence service for crawled pictures; static for the same reason as redisService. */
+    private static _36wallpaperService wallpaperService;
+
+    @Autowired
+    public void setWallpaperService(_36wallpaperService wallpaperService) {
+        _36wallpaperProcessor.wallpaperService = wallpaperService;
+    }
+
+    /**
+     * Root directory on disk the pictures are saved under (hard-coded Windows path)
+     */
+    private static final String path = "D:\\Dev\\WebCrawler\\36wallpaper";
+
+
+    private Site site = Site.me()
+            .setCharset("utf8")// character encoding
+            .setTimeOut(2000)// timeout
+            .setRetrySleepTime(200)// delay between retries
+            .setCycleRetryTimes(6)// number of retries
+            .setSleepTime(1)// delay between two pages
+            ;
+
     //解析页面
     @Override
     public void process(Page page) {
@@ -44,12 +91,84 @@ public class _36wallpaperProcessor implements PageProcessor {
         //第三种写法:正则表达式
         page.putField("url_regex", page.getHtml().css(".tmenu li a").regex(".*图片*.*").all());*/
 
-        // todo 爬取36壁纸图片
+        try {
+            // Queue the category links found in the top menu.
+            page.addTargetRequests(page.getHtml().css(".topmenuc .tmenu li").links().all());
+
+            // Pagination links are followed only for a whole-site crawl (init == true).
+            if (init) {
+                page.addTargetRequests(page.getHtml().css("#pageNum a").links().all());
+            }
+
+            // Queue the links of the pictures listed on this page.
+            page.addTargetRequests(page.getHtml().css(".contlistw > .cl > li > a").links().all());
+
+            List<_36wallpaper> wallpapers = new ArrayList<>();
+            List<Selectable> bodyNodes = page.getHtml().css("body").nodes();
+            for (Selectable body : bodyNodes) {
+                _36wallpaper wallpaper = new _36wallpaper();
+
+                // Picture category; stays the literal "null" when the page has none.
+                String titleHtml = body.css(".catpos a:nth-child(3)").get();
+                String title = "null";
+                if (titleHtml != null) {
+                    Document titleDom = Jsoup.parse(titleHtml);
+                    title = titleDom.text();
+                    wallpaper.setType(title);
+                }
 
-        //获取链接
-        page.addTargetRequests(page.getHtml().css(".tmenu li").links().all());
-        page.putField("url", page.getHtml().css(".imgw").links().all());
 
+                // Picture name; stays the literal "null" when the page has none.
+                String nameHtml = body.css(".showtitle h2").get();
+                String pictureName = "null";
+                if (nameHtml != null) {
+                    pictureName = Jsoup.parse(nameHtml).text();
+                    wallpaper.setPictureName(pictureName);
+                }
+
+                // Picture URL; entries without one are neither downloaded nor persisted.
+                String link = body.css(".morew").links().get();
+                wallpaper.setPictureUrl(link);
+
+                // Save the picture to disk under a per-category directory.
+                if (link != null) {
+                    String thisPath = path + File.separator + title;
+                    downloadPicture(link, thisPath, pictureName + ".jpg");
+                }
+
+                // Picture tags, joined with commas.
+                List<String> tagList = body.css(".showcontw > .showtaglistw a").all();
+                StringBuilder labels = new StringBuilder();
+                for (int i = 0; i < tagList.size(); i++) {
+                    if (i > 0) {
+                        labels.append(',');
+                    }
+                    labels.append(Jsoup.parse(tagList.get(i)).text());
+                }
+                wallpaper.setLabel(labels.toString());
+
+                if (wallpaper.getPictureUrl() != null) {
+                    wallpapers.add(wallpaper);
+                }
+            }
+
+            // Persist; the call is skipped when nothing was collected on this page.
+            if (!wallpapers.isEmpty()) {
+                wallpaperService.saveBatch(wallpapers, 25);
+            }
+
+            // Count this page in Redis. The key is absent for the first page, so a missing
+            // value counts as zero; the lock keeps the read-modify-write from interleaving
+            // between the spider's worker threads of this JVM.
+            synchronized (_36wallpaperProcessor.class) {
+                Integer count = redisService.getCacheObject(REPTILE_COUNT);
+                redisService.setCacheObject(REPTILE_COUNT, count == null ? 1 : count + 1);
+            }
+        } catch (Exception e) {
+            log.error("Failed to process page " + page.getUrl(), e);
+        } finally {
+            redisService.expire(REPTILE_COUNT, 1, TimeUnit.HOURS);
+        }
     }
 
 
@@ -61,14 +180,62 @@ public class _36wallpaperProcessor implements PageProcessor {
 
     /**
      * 执行爬虫
+     *
+     * @return loop count read from Redis, or 0 when none was recorded
      */
-    public void run() {
-        Spider.create(new _36wallpaperProcessor()).addUrl(URL).thread(5).runAsync();
+    public Long run() {
+        // run() (not runAsync()) so the steps below execute after the crawl has ended.
+        Spider.create(new _36wallpaperProcessor()).addUrl(_36_WALLPAPER_URL).thread(20)
+                .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(110000)))
+                .run();
+
+        // Remove duplicate rows.
+        int count = wallpaperService.deleteRepeatData();
+        log.info("36壁纸删除重复数据数:" + count);
+
+        // Read the loop count from Redis, then clear it for the next run.
+        Integer cache = redisService.getCacheObject(REPTILE_COUNT);
+        redisService.deleteObject(REPTILE_COUNT);
+
+        return cache == null ? 0L : cache.longValue();
+    }
+
+    /**
+     * Downloads a picture and stores it as {@code path/fileName}.
+     * Failures are logged and otherwise ignored, so one bad picture does not abort the page.
+     *
+     * @param urlList  URL of the picture to download
+     * @param path     directory to store the picture in; created (with parents) when missing
+     * @param fileName file name within {@code path}; "/" and "\" are replaced with "-"
+     */
+    private void downloadPicture(String urlList, String path, String fileName) {
+        File dir = new File(path);
+        // mkdirs() rather than mkdir(): the parent directory may not exist yet either. The
+        // second isDirectory() covers another worker thread creating it in the meantime.
+        if (!dir.isDirectory() && !dir.mkdirs() && !dir.isDirectory()) {
+            log.error("Cannot create directory " + dir.getAbsolutePath());
+            return;
+        }
+
+        // Both separators are replaced so the name stays a single path component.
+        String safeName = fileName.replace("/", "-").replace("\\", "-");
+        File target = new File(dir, safeName);
+
+        // try-with-resources closes both streams even when the copy fails half-way.
+        try (InputStream in = new URL(urlList).openStream();
+             OutputStream out = new FileOutputStream(target)) {
+            byte[] buffer = new byte[1024];
+            int length;
+            while ((length = in.read(buffer)) != -1) {
+                out.write(buffer, 0, length);
+            }
+        } catch (IOException e) {
+            log.error("Failed to download " + urlList + " to " + target.getAbsolutePath(), e);
+        }
     }
 
 
     public static void main(String[] args) {
-        Spider.create(new _36wallpaperProcessor()).addUrl("https://www.3gbizhi.com/").thread(5).run();
+        Spider.create(new _36wallpaperProcessor()).addUrl(_36_WALLPAPER_URL).thread(15).run();
     }
 
 }
diff --git a/xjs-business/xjs-business-webmagic/src/main/resources/mapper/webmagic/_36wallpaperMapper.xml b/xjs-business/xjs-business-webmagic/src/main/resources/mapper/webmagic/_36wallpaperMapper.xml
new file mode 100644
index 00000000..70247044
--- /dev/null
+++ b/xjs-business/xjs-business-webmagic/src/main/resources/mapper/webmagic/_36wallpaperMapper.xml
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE mapper
+        PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
+        "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
+<mapper namespace="com.xjs._36wallpaper.mapper._36wallpaperMapper">
+
+
+    <!-- Delete duplicate rows, keeping the row with the lowest id for each picture_url -->
+    <delete id="deleteRepeatData">
+        delete from webmagic_36wallpaper where id not in (
+            SELECT
+                t.min_id
+            FROM
+                ( SELECT min( id ) AS min_id FROM webmagic_36wallpaper GROUP BY picture_url ) AS t
+        )
+    </delete>
+</mapper>
\ No newline at end of file