From 3a1b4c2153aea182c64033347d5b0e818b2722b0 Mon Sep 17 00:00:00 2001 From: xjs <1294405880@qq.com> Date: Mon, 18 Apr 2022 15:27:21 +0800 Subject: [PATCH] =?UTF-8?q?1=E3=80=81=E4=BC=98=E5=8C=96=E4=B8=AD=E5=85=B3?= =?UTF-8?q?=E6=9D=91=E6=95=B0=E6=8D=AE=E7=88=AC=E8=99=AB=E9=80=BB=E8=BE=91?= =?UTF-8?q?=202=E3=80=81=E6=96=B0=E5=A2=9E=E7=88=AC=E5=8F=96=E4=B8=AD?= =?UTF-8?q?=E5=85=B3=E6=9D=91=E7=AC=94=E8=AE=B0=E6=9C=AC=E7=94=B5=E8=84=91?= =?UTF-8?q?=E6=95=B0=E6=8D=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java/com/xjs/consts/ReptileConst.java | 4 +- .../zol/controller/ZolPhoneController.java | 6 +- .../com/xjs/zol/mapper/ZolNotebookMapper.java | 12 + .../java/com/xjs/zol/pojo/ZolNotebook.java | 73 ++++++ .../xjs/zol/service/ZolNotebookService.java | 12 + .../service/impl/ZolNotebookServiceImpl.java | 16 ++ .../task/{ZolPhoneTask.java => ZolTask.java} | 18 +- .../xjs/zol/webmagic/ZolPhonePipeline.java | 55 ----- .../xjs/zol/webmagic/ZolPhoneProcessor.java | 141 ----------- .../com/xjs/zol/webmagic/ZolPipeline.java | 99 ++++++++ .../com/xjs/zol/webmagic/ZolProcessor.java | 226 ++++++++++++++++++ ...ZolPhoneTaskTest.java => ZolTaskTest.java} | 8 +- 12 files changed, 455 insertions(+), 215 deletions(-) create mode 100644 xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/mapper/ZolNotebookMapper.java create mode 100644 xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/pojo/ZolNotebook.java create mode 100644 xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/service/ZolNotebookService.java create mode 100644 xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/service/impl/ZolNotebookServiceImpl.java rename xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/task/{ZolPhoneTask.java => ZolTask.java} (75%) delete mode 100644 xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/webmagic/ZolPhonePipeline.java delete mode 100644 xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/webmagic/ZolPhoneProcessor.java create mode 100644 xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/webmagic/ZolPipeline.java create mode 100644 xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/webmagic/ZolProcessor.java rename xjs-business/xjs-business-webmagic/src/test/java/com/xjs/zol/task/{ZolPhoneTaskTest.java => ZolTaskTest.java} (70%) diff --git a/xjs-business/xjs-business-common/src/main/java/com/xjs/consts/ReptileConst.java b/xjs-business/xjs-business-common/src/main/java/com/xjs/consts/ReptileConst.java index 02ee2b1a..c0cf2a21 100644 --- a/xjs-business/xjs-business-common/src/main/java/com/xjs/consts/ReptileConst.java +++ b/xjs-business/xjs-business-common/src/main/java/com/xjs/consts/ReptileConst.java @@ -41,9 +41,9 @@ public class ReptileConst { public static final String WEIXIN_SOUGOU_URL= "https://weixin.sogou.com/"; /** - * 中关村手机报价页面 + * 中关村报价url */ - public static final String ZOL_PHONE_URL= "https://detail.zol.com.cn/cell_phone_index/subcate57_list_1.html"; + public static final String ZOL_PHONE_URL= "https://detail.zol.com.cn"; diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/controller/ZolPhoneController.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/controller/ZolPhoneController.java index 23601391..0d761162 100644 --- a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/controller/ZolPhoneController.java +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/controller/ZolPhoneController.java @@ -1,7 +1,7 @@ package com.xjs.zol.controller; import com.ruoyi.common.core.domain.R; -import com.xjs.zol.task.ZolPhoneTask; +import com.xjs.zol.task.ZolTask; import io.swagger.annotations.Api; import io.swagger.annotations.ApiOperation; import org.springframework.beans.factory.annotation.Autowired; @@ -20,14 +20,14 @@ import org.springframework.web.bind.annotation.RestController; public class ZolPhoneController { @Autowired - private ZolPhoneTask zolPhoneTask; + private ZolTask zolTask; //------------------------------内部调用rpc------------------------------------- @GetMapping("taskForPRC") @ApiOperation("供定时任务服务RPC远程调用") public R ZolPhoneTaskForRPC() { - Long aLong = zolPhoneTask.reptileZolPhone(); + Long aLong = zolTask.reptileZol(); return R.ok(aLong); } } diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/mapper/ZolNotebookMapper.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/mapper/ZolNotebookMapper.java new file mode 100644 index 00000000..83032df5 --- /dev/null +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/mapper/ZolNotebookMapper.java @@ -0,0 +1,12 @@ +package com.xjs.zol.mapper; + +import com.baomidou.mybatisplus.core.mapper.BaseMapper; +import com.xjs.zol.pojo.ZolNotebook; + +/** + * 爬虫数据中关村笔记本mapper + * @author xiejs + * @since 2022-04-18 + */ +public interface ZolNotebookMapper extends BaseMapper { +} diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/pojo/ZolNotebook.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/pojo/ZolNotebook.java new file mode 100644 index 00000000..7d9f954d --- /dev/null +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/pojo/ZolNotebook.java @@ -0,0 +1,73 @@ +package com.xjs.zol.pojo; + +import com.baomidou.mybatisplus.annotation.FieldFill; +import com.baomidou.mybatisplus.annotation.TableField; +import com.baomidou.mybatisplus.annotation.TableName; +import com.ruoyi.common.core.annotation.Excel; +import com.xjs.entity.BaseEntity; +import lombok.Data; +import lombok.EqualsAndHashCode; +import lombok.experimental.Accessors; + +import java.io.Serializable; +import java.math.BigDecimal; +import java.util.Date; + +/** + * 爬虫数据中关村笔记本对象 webmagic_zol_notebook + * + * @author xjs + * @since 2022-04-18 + */ +@Data +@EqualsAndHashCode(callSuper = true) +@Accessors(chain = true) +@TableName("webmagic_zol_notebook") +public class ZolNotebook extends BaseEntity implements Serializable { + private static final long serialVersionUID = 1L; + + /** + * 主键id + */ + private Long id; + + /** + * 笔记本名称 + */ + @Excel(name = "笔记本名称") + private String notebookName; + + /** + * 图片地址 + */ + @Excel(name = "图片地址") + private String pictureUrl; + + /** + * 笔记本描述 + */ + @Excel(name = "笔记本描述") + private String description; + + /** + * 笔记本详情页面 + */ + @Excel(name = "笔记本详情页面") + private String detailPage; + + /** + * 热度 + */ + @Excel(name = "热度") + private BigDecimal heat; + + /** + * 价格 + */ + @Excel(name = "价格") + private String price; + + @Excel(name = "创建时间", dateFormat = "yyyy-MM-dd HH:mm:ss") + @TableField(fill = FieldFill.INSERT) + private Date createTime; +} diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/service/ZolNotebookService.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/service/ZolNotebookService.java new file mode 100644 index 00000000..337fe257 --- /dev/null +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/service/ZolNotebookService.java @@ -0,0 +1,12 @@ +package com.xjs.zol.service; + +import com.baomidou.mybatisplus.extension.service.IService; +import com.xjs.zol.pojo.ZolNotebook; + +/** + * 爬虫数据中关村笔记本service接口 + * @author xiejs + * @since 2022-04-18 + */ +public interface ZolNotebookService extends IService { +} diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/service/impl/ZolNotebookServiceImpl.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/service/impl/ZolNotebookServiceImpl.java new file mode 100644 index 00000000..de0bdd57 --- /dev/null +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/service/impl/ZolNotebookServiceImpl.java @@ -0,0 +1,16 @@ +package com.xjs.zol.service.impl; + +import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl; +import com.xjs.zol.mapper.ZolNotebookMapper; +import com.xjs.zol.pojo.ZolNotebook; +import com.xjs.zol.service.ZolNotebookService; +import org.springframework.stereotype.Service; + +/** + * 爬虫数据中关村笔记本service接口实现 + * @author xiejs + * @since 2022-04-18 + */ +@Service +public class ZolNotebookServiceImpl extends ServiceImpl implements ZolNotebookService { +} diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/task/ZolPhoneTask.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/task/ZolTask.java similarity index 75% rename from xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/task/ZolPhoneTask.java rename to xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/task/ZolTask.java index c234c97c..f09a04b9 100644 --- a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/task/ZolPhoneTask.java +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/task/ZolTask.java @@ -2,8 +2,8 @@ package com.xjs.zol.task; import com.ruoyi.common.redis.service.RedisService; import com.xjs.annotation.ReptileLog; -import com.xjs.zol.webmagic.ZolPhonePipeline; -import com.xjs.zol.webmagic.ZolPhoneProcessor; +import com.xjs.zol.webmagic.ZolPipeline; +import com.xjs.zol.webmagic.ZolProcessor; import lombok.extern.log4j.Log4j2; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Component; @@ -22,25 +22,25 @@ import static com.xjs.consts.ReptileConst.ZOL_PHONE_URL; */ @Component @Log4j2 -public class ZolPhoneTask { +public class ZolTask { @Autowired - private ZolPhoneProcessor zolPhoneProcessor; + private ZolProcessor zolProcessor; @Autowired private RedisService redisService; @Autowired - private ZolPhonePipeline zolPhonePipeline; + private ZolPipeline zolPipeline; - @ReptileLog(name = "中关村手机", url = ZOL_PHONE_URL) - public Long reptileZolPhone() { + @ReptileLog(name = "中关村", url = ZOL_PHONE_URL) + public Long reptileZol() { //执行爬虫 - Spider.create(zolPhoneProcessor) + Spider.create(zolProcessor) .addUrl(ZOL_PHONE_URL)//设置爬取地址 .thread(30)//设置爬取线程数 .setScheduler(new QueueScheduler() .setDuplicateRemover(new BloomFilterDuplicateRemover(110000)))//设置url去重过滤器 - .addPipeline(zolPhonePipeline)//设置爬取之后的数据操作 + .addPipeline(zolPipeline)//设置爬取之后的数据操作 //.setDownloader(downloader)//设置下载器 .run();//同步执行 diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/webmagic/ZolPhonePipeline.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/webmagic/ZolPhonePipeline.java deleted file mode 100644 index 3f341fba..00000000 --- a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/webmagic/ZolPhonePipeline.java +++ /dev/null @@ -1,55 +0,0 @@ -package com.xjs.zol.webmagic; - -import cn.hutool.core.collection.CollUtil; -import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper; -import com.baomidou.mybatisplus.core.conditions.update.LambdaUpdateWrapper; -import com.xjs.zol.pojo.ZolPhone; -import com.xjs.zol.service.ZolPhoneService; -import lombok.extern.log4j.Log4j2; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.stereotype.Component; -import org.springframework.transaction.annotation.Transactional; -import us.codecraft.webmagic.ResultItems; -import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.pipeline.Pipeline; - -import java.util.List; -import java.util.Objects; - -/** - * 中关村手机页面爬虫数据处理 - * - * @author xiejs - * @since 2022-04-17 - */ -@Component -@Log4j2 -@Transactional -public class ZolPhonePipeline implements Pipeline { - - @Autowired - private ZolPhoneService zolPhoneService; - - @Override - public void process(ResultItems resultItems, Task task) { - List zolPhoneList = resultItems.get("zolPhoneList"); - if (CollUtil.isNotEmpty(zolPhoneList)) { - //循环遍历集合,当对象的名称在数据库为空才插入数据 - for (ZolPhone zolPhone : zolPhoneList) { - ZolPhone dbData = zolPhoneService.getOne(new LambdaQueryWrapper() - .eq(ZolPhone::getPhoneName, zolPhone.getPhoneName()), false); - if (Objects.isNull(dbData)) { - zolPhoneService.save(zolPhone); - } else { - //当前值与数据库热度值不相等的情况下更新数据库 - if (zolPhone.getHeat().compareTo(dbData.getHeat()) != 0) { - zolPhoneService.update(new LambdaUpdateWrapper() - .eq(ZolPhone::getPhoneName, zolPhone.getPhoneName()) - .set(ZolPhone::getHeat, zolPhone.getHeat())); - } - - } - } - } - } -} diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/webmagic/ZolPhoneProcessor.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/webmagic/ZolPhoneProcessor.java deleted file mode 100644 index b33686a4..00000000 --- a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/webmagic/ZolPhoneProcessor.java +++ /dev/null @@ -1,141 +0,0 @@ -package com.xjs.zol.webmagic; - -import com.ruoyi.common.core.utils.StringUtils; -import com.ruoyi.common.redis.service.RedisService; -import com.xjs.zol.pojo.ZolPhone; -import lombok.extern.log4j.Log4j2; -import org.apache.commons.lang3.math.NumberUtils; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.stereotype.Component; -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.processor.PageProcessor; -import us.codecraft.webmagic.selector.Selectable; - -import java.math.BigDecimal; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.concurrent.TimeUnit; - -import static com.xjs.consts.RedisConst.REPTILE_ZOL_PHONE_COUNT; - -/** - * 中关村产品报价爬取手机处理 - * - * @author xiejs - * @since 2022-04-17 - */ -@Log4j2 -@Component -public class ZolPhoneProcessor implements PageProcessor { - - public static final String URL = "https://detail.zol.com.cn/"; - - @Autowired - private RedisService redisService; - - @Override - public void process(Page page) { - - try { - Integer count = redisService.getCacheObject(REPTILE_ZOL_PHONE_COUNT); - if (count == null) { - count = 0; - } - //获取其他页面放入队列中 - //等待爬虫的页面后缀 - String html_href = page.getHtml().css(".page-box > .pagebar > .next", "href").get(); - - Thread.sleep(100); - - page.addTargetRequests(Collections.singletonList(html_href)); - - List zolPhoneList = new ArrayList<>(); - - //拿到每个手机的 li 标签 - List lis = page.getHtml().css("#J_PicMode > li").nodes(); - - for (Selectable li : lis) { - ZolPhone zolPhone = new ZolPhone(); - - //排除无用数据 - if ("display:none;".equals(li.css("li", "style").get())) { - continue; - } - - //获取手机的详情页面url - String href = li.css("li > .pic", "href").get(); - - zolPhone.setDetailPage(URL + href); - - //获取手机的名称 - String phoneName = li.css("li > h3 > a", "text").get(); - zolPhone.setPhoneName(phoneName); - - //获取手机的描述 - String desc = li.css("li > h3 > a > span", "text").get(); - zolPhone.setDescription(desc); - - //获取手机的参考价 - String price = li.css("li > .price-row .price-type", "text").get(); - //排除无用数据 - if (StringUtils.isNotBlank(price)) { - //检查是否是数字 - boolean creatable = NumberUtils.isCreatable(price); - if (creatable) { - zolPhone.setPrice(new BigDecimal(price)); - } else { - continue; - } - } else { - continue; - } - - //获取手机的评分 - String heat = li.css("li > .comment-row > .score", "text").get(); - if (StringUtils.isNotBlank(heat)) { - boolean creatable = NumberUtils.isCreatable(price); - if (creatable) { - zolPhone.setHeat(new BigDecimal(heat)); - } else { - continue; - } - } else { - continue; - } - - //获取手机图片的地址 - String picture = li.css("li > .pic > img", ".src").get(); - zolPhone.setPictureUrl(picture); - - zolPhoneList.add(zolPhone); - - //计数 - count++; - } - - page.putField("zolPhoneList", zolPhoneList); - - redisService.setCacheObject(REPTILE_ZOL_PHONE_COUNT, count); - } catch (Exception e) { - log.error(e.getMessage()); - e.printStackTrace(); - } finally { - redisService.expire(REPTILE_ZOL_PHONE_COUNT, 3, TimeUnit.HOURS); - } - - } - - @Override - public Site getSite() { - return Site.me() - //.addHeader(headerKey, headerValue) - .setCharset("GBK")//设置字符编码 - .setTimeOut(2000)//设置超时时间 - .setRetrySleepTime(100)//设置重试间隔时间 - .setCycleRetryTimes(10)//设置重试次数 - .setSleepTime(1)//设置两个页面之间的间隔时间 - ; - } -} diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/webmagic/ZolPipeline.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/webmagic/ZolPipeline.java new file mode 100644 index 00000000..8d03e861 --- /dev/null +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/webmagic/ZolPipeline.java @@ -0,0 +1,99 @@ +package com.xjs.zol.webmagic; + +import cn.hutool.core.collection.CollUtil; +import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper; +import com.baomidou.mybatisplus.core.conditions.update.LambdaUpdateWrapper; +import com.xjs.zol.pojo.ZolNotebook; +import com.xjs.zol.pojo.ZolPhone; +import com.xjs.zol.service.ZolNotebookService; +import com.xjs.zol.service.ZolPhoneService; +import lombok.extern.log4j.Log4j2; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Component; +import org.springframework.transaction.annotation.Transactional; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.pipeline.Pipeline; + +import java.util.List; +import java.util.Objects; + +/** + * 中关村手机页面爬虫数据处理 + * + * @author xiejs + * @since 2022-04-17 + */ +@Component +@Log4j2 +@Transactional +public class ZolPipeline implements Pipeline { + + @Autowired + private ZolPhoneService zolPhoneService; + @Autowired + private ZolNotebookService zolNotebookService; + + @Override + public void process(ResultItems resultItems, Task task) { + List zolPhoneList = resultItems.get("zolPhoneList"); + this.saveOrUpdateZolPhoneData(zolPhoneList); + + List zolNotebookList = resultItems.get("zolNotebookList"); + this.saveOrUpdateZolNotebookData(zolNotebookList); + } + + /** + * 更新或保存中关村手机数据 + * + * @param zolPhoneList 手机集合 + */ + private void saveOrUpdateZolPhoneData(List zolPhoneList) { + if (CollUtil.isNotEmpty(zolPhoneList)) { + //循环遍历集合,当对象的名称在数据库为空才插入数据 + for (ZolPhone zolPhone : zolPhoneList) { + ZolPhone dbData = zolPhoneService.getOne(new LambdaQueryWrapper().eq(ZolPhone::getPhoneName, zolPhone.getPhoneName()), false); + if (Objects.isNull(dbData)) { + zolPhoneService.save(zolPhone); + } else { + //当前值与数据库热度值不相等的情况下更新数据库 + if (zolPhone.getHeat().compareTo(dbData.getHeat()) != 0) { + zolPhoneService.update(new LambdaUpdateWrapper().eq(ZolPhone::getPhoneName, zolPhone.getPhoneName()).set(ZolPhone::getHeat, zolPhone.getHeat())); + } + + //更新参考价 + if (zolPhone.getPrice().compareTo(dbData.getPrice()) != 0) { + zolPhoneService.update(new LambdaUpdateWrapper().eq(ZolPhone::getPhoneName, zolPhone.getPhoneName()).set(ZolPhone::getPrice, zolPhone.getPrice())); + } + } + } + } + } + + /** + * 更新或保存中关村笔记本数据 + * + * @param zolNotebookList 笔记本集合 + */ + private void saveOrUpdateZolNotebookData(List zolNotebookList) { + if (CollUtil.isNotEmpty(zolNotebookList)) { + //循环遍历集合,当对象的名称在数据库为空才插入数据 + for (ZolNotebook zolNotebook : zolNotebookList) { + ZolNotebook dbData = zolNotebookService.getOne(new LambdaQueryWrapper().eq(ZolNotebook::getNotebookName, zolNotebook.getNotebookName()), false); + if (Objects.isNull(dbData)) { + zolNotebookService.save(zolNotebook); + } else { + //当前值与数据库热度值不相等的情况下更新数据库 + if (zolNotebook.getHeat().compareTo(dbData.getHeat()) != 0) { + zolNotebookService.update(new LambdaUpdateWrapper().eq(ZolNotebook::getNotebookName, zolNotebook.getNotebookName()).set(ZolNotebook::getHeat, zolNotebook.getHeat())); + } + + //更新参考价 + if (!zolNotebook.getPrice().equals(dbData.getPrice())) { + zolNotebookService.update(new LambdaUpdateWrapper().eq(ZolNotebook::getNotebookName, zolNotebook.getNotebookName()).set(ZolNotebook::getPrice, zolNotebook.getPrice())); + } + } + } + } + } +} diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/webmagic/ZolProcessor.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/webmagic/ZolProcessor.java new file mode 100644 index 00000000..606621a7 --- /dev/null +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/zol/webmagic/ZolProcessor.java @@ -0,0 +1,226 @@ +package com.xjs.zol.webmagic; + +import com.ruoyi.common.core.utils.StringUtils; +import com.ruoyi.common.redis.service.RedisService; +import com.xjs.zol.pojo.ZolNotebook; +import com.xjs.zol.pojo.ZolPhone; +import lombok.extern.log4j.Log4j2; +import org.apache.commons.lang3.math.NumberUtils; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Component; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.selector.Selectable; + +import java.math.BigDecimal; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.TimeUnit; + +import static com.xjs.consts.RedisConst.REPTILE_ZOL_PHONE_COUNT; + +/** + * 中关村产品报价爬取处理 + * + * @author xiejs + * @since 2022-04-17 + */ +@Log4j2 +@Component +@SuppressWarnings("all") +public class ZolProcessor implements PageProcessor { + + public static final String URL = "https://detail.zol.com.cn/"; + + @Autowired + private RedisService redisService; + + @Override + public void process(Page page) { + + try { + Integer count = redisService.getCacheObject(REPTILE_ZOL_PHONE_COUNT); + if (count == null) { + count = 0; + } + + List nodes = page.getHtml().css("#J_CategoryItems > .item > h3 > a").nodes(); + for (Selectable node : nodes) { + + if ("手机".equals(node.css("a", "text").get())) { + String href = node.css("a", "href").get(); + page.addTargetRequests(Collections.singletonList(href)); + } + if ("笔记本".equals(node.css("a", "text").get())) { + String href = node.css("a", "href").get(); + page.addTargetRequests(Collections.singletonList(href)); + } + + } + + //获取其他页面放入队列中 + //等待爬虫的页面后缀 + String html_href = page.getHtml().css(".page-box > .pagebar > .next", "href").get(); + + Thread.sleep(1); + + page.addTargetRequests(Collections.singletonList(html_href)); + + //获取当前页面是手机还是电脑 + String title = page.getHtml().css(".wrapper > .breadcrumb-filter-selected > .breadcrumb > span", "text").get(); + + //拿到每个li 标签 + List lis = page.getHtml().css("#J_PicMode > li").nodes(); + if (StringUtils.isNotEmpty(title) && title.contains("手机")) { + + List zolPhoneList = new ArrayList<>(); + + for (Selectable li : lis) { + ZolPhone zolPhone = new ZolPhone(); + + //排除无用数据 + if ("display:none;".equals(li.css("li", "style").get())) { + continue; + } + + //获取手机的详情页面url + String href = li.css("li > .pic", "href").get(); + + zolPhone.setDetailPage(URL + href); + + //获取手机的名称 + String phoneName = li.css("li > h3 > a", "text").get(); + zolPhone.setPhoneName(phoneName); + + //获取手机的描述 + String desc = li.css("li > h3 > a > span", "text").get(); + zolPhone.setDescription(desc); + + //获取手机的参考价 + String price = li.css("li > .price-row .price-type", "text").get(); + //排除无用数据 + if (StringUtils.isNotBlank(price)) { + //检查是否是数字 + boolean creatable = NumberUtils.isCreatable(price); + if (creatable) { + zolPhone.setPrice(new BigDecimal(price)); + } else { + continue; + } + } else { + continue; + } + + //获取手机的评分 + String heat = li.css("li > .comment-row > .score", "text").get(); + if (StringUtils.isNotBlank(heat)) { + boolean creatable = NumberUtils.isCreatable(heat); + if (creatable) { + zolPhone.setHeat(new BigDecimal(heat)); + } else { + continue; + } + } else { + continue; + } + + //获取手机图片的地址 + String picture = li.css("li > .pic > img", ".src").get(); + zolPhone.setPictureUrl(picture); + + zolPhoneList.add(zolPhone); + + //计数 + count++; + } + + page.putField("zolPhoneList", zolPhoneList); + + } + + if (StringUtils.isNotEmpty(title) && title.contains("笔记本")) { + ArrayList zolNotebooks = new ArrayList<>(); + + for (Selectable li : lis) { + ZolNotebook zolNotebook = new ZolNotebook(); + + //排除无用数据 + if ("display:none;".equals(li.css("li", "style").get())) { + continue; + } + + //获取笔记本的详情页面url + String href = li.css("li > .pic", "href").get(); + zolNotebook.setDetailPage(URL + href); + + //获取笔记本的名称 + String notebookName = li.css("li > h3 > .title-black > a", "text").get(); + zolNotebook.setNotebookName(notebookName); + + //获取笔记本的描述 + String desc = li.css("li > h3 > a", "text").get(); + zolNotebook.setDescription(desc); + + //获取笔记本的参考价 + String price = li.css("li > .price-row .price-type", "text").get(); + //排除无用数据 + if (StringUtils.isNotBlank(price)) { + if ("停产".equals(price) + || "概念产品".equals(price) + || "即将上市".equals(price) + || "暂无报价".equals(price)) { + continue; + } + zolNotebook.setPrice(price); + } else { + continue; + } + + //获取笔记本的评分 + String heat = li.css("li > .comment-row > .score", "text").get(); + if (StringUtils.isNotBlank(heat)) { + boolean creatable = NumberUtils.isCreatable(heat); + if (creatable) { + zolNotebook.setHeat(new BigDecimal(heat)); + } else { + continue; + } + } else { + continue; + } + + //获取笔记本图片的地址 + String picture = li.css("li > .pic > img", ".src").get(); + zolNotebook.setPictureUrl(picture); + + zolNotebooks.add(zolNotebook); + + count++; + } + page.putField("zolNotebookList", zolNotebooks); + } + + redisService.setCacheObject(REPTILE_ZOL_PHONE_COUNT, count); + } catch (Exception e) { + log.error(e.getMessage()); + e.printStackTrace(); + } finally { + redisService.expire(REPTILE_ZOL_PHONE_COUNT, 3, TimeUnit.HOURS); + } + + } + + @Override + public Site getSite() { + return Site.me() + //.addHeader(headerKey, headerValue) + .setCharset("GBK")//设置字符编码 + .setTimeOut(3000)//设置超时时间 + .setRetrySleepTime(100)//设置重试间隔时间 + .setCycleRetryTimes(10)//设置重试次数 + .setSleepTime(1)//设置两个页面之间的间隔时间 + ; + } +} diff --git a/xjs-business/xjs-business-webmagic/src/test/java/com/xjs/zol/task/ZolPhoneTaskTest.java b/xjs-business/xjs-business-webmagic/src/test/java/com/xjs/zol/task/ZolTaskTest.java similarity index 70% rename from xjs-business/xjs-business-webmagic/src/test/java/com/xjs/zol/task/ZolPhoneTaskTest.java rename to xjs-business/xjs-business-webmagic/src/test/java/com/xjs/zol/task/ZolTaskTest.java index 82c378d3..6dcf05b0 100644 --- a/xjs-business/xjs-business-webmagic/src/test/java/com/xjs/zol/task/ZolPhoneTaskTest.java +++ b/xjs-business/xjs-business-webmagic/src/test/java/com/xjs/zol/task/ZolTaskTest.java @@ -5,21 +5,19 @@ import org.junit.jupiter.api.Test; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.test.context.SpringBootTest; -import static org.junit.jupiter.api.Assertions.*; - /** * @author xiejs * @since 2022-04-17 */ @SpringBootTest(classes = XjsWebmagicApp.class) -class ZolPhoneTaskTest { +class ZolTaskTest { @Autowired - private ZolPhoneTask zolPhoneTask; + private ZolTask zolTask; @Test void reptileZolPhone() { - Long aLong = zolPhoneTask.reptileZolPhone(); + Long aLong = zolTask.reptileZol(); System.out.println(aLong); } }