1、优化中关村数据爬虫逻辑

2、新增爬取中关村笔记本电脑数据
pull/254/head
xjs 3 years ago
parent f22a31a333
commit 3a1b4c2153

@ -41,9 +41,9 @@ public class ReptileConst {
public static final String WEIXIN_SOUGOU_URL= "https://weixin.sogou.com/"; public static final String WEIXIN_SOUGOU_URL= "https://weixin.sogou.com/";
/** /**
* * url
*/ */
public static final String ZOL_PHONE_URL= "https://detail.zol.com.cn/cell_phone_index/subcate57_list_1.html"; public static final String ZOL_PHONE_URL= "https://detail.zol.com.cn";

@ -1,7 +1,7 @@
package com.xjs.zol.controller; package com.xjs.zol.controller;
import com.ruoyi.common.core.domain.R; import com.ruoyi.common.core.domain.R;
import com.xjs.zol.task.ZolPhoneTask; import com.xjs.zol.task.ZolTask;
import io.swagger.annotations.Api; import io.swagger.annotations.Api;
import io.swagger.annotations.ApiOperation; import io.swagger.annotations.ApiOperation;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Autowired;
@ -20,14 +20,14 @@ import org.springframework.web.bind.annotation.RestController;
public class ZolPhoneController { public class ZolPhoneController {
@Autowired @Autowired
private ZolPhoneTask zolPhoneTask; private ZolTask zolTask;
//------------------------------内部调用rpc------------------------------------- //------------------------------内部调用rpc-------------------------------------
@GetMapping("taskForPRC") @GetMapping("taskForPRC")
@ApiOperation("供定时任务服务RPC远程调用") @ApiOperation("供定时任务服务RPC远程调用")
public R<Long> ZolPhoneTaskForRPC() { public R<Long> ZolPhoneTaskForRPC() {
Long aLong = zolPhoneTask.reptileZolPhone(); Long aLong = zolTask.reptileZol();
return R.ok(aLong); return R.ok(aLong);
} }
} }

@ -0,0 +1,12 @@
package com.xjs.zol.mapper;
import com.baomidou.mybatisplus.core.mapper.BaseMapper;
import com.xjs.zol.pojo.ZolNotebook;
/**
* MyBatis-Plus mapper for {@link ZolNotebook} entities.
* <p>
* Declares no methods of its own; everything it offers is inherited from {@link BaseMapper}.
*
* @author xiejs
* @since 2022-04-18
*/
public interface ZolNotebookMapper extends BaseMapper<ZolNotebook> {
}

@ -0,0 +1,73 @@
package com.xjs.zol.pojo;
import com.baomidou.mybatisplus.annotation.FieldFill;
import com.baomidou.mybatisplus.annotation.TableField;
import com.baomidou.mybatisplus.annotation.TableName;
import com.ruoyi.common.core.annotation.Excel;
import com.xjs.entity.BaseEntity;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.Accessors;
import java.io.Serializable;
import java.math.BigDecimal;
import java.util.Date;
/**
* Entity mapped to the {@code webmagic_zol_notebook} table; holds one notebook record.
* <p>
* Lombok generates accessors, chained setters and equals/hashCode (including the superclass).
*
* @author xjs
* @since 2022-04-18
*/
@Data
@EqualsAndHashCode(callSuper = true)
@Accessors(chain = true)
@TableName("webmagic_zol_notebook")
public class ZolNotebook extends BaseEntity implements Serializable {
private static final long serialVersionUID = 1L;
/**
* Record id.
*/
private Long id;
/**
* Notebook name.
*/
@Excel(name = "笔记本名称")
private String notebookName;
/**
* Address (URL) of the notebook picture.
*/
@Excel(name = "图片地址")
private String pictureUrl;
/**
* Notebook description.
*/
@Excel(name = "笔记本描述")
private String description;
/**
* Address (URL) of the notebook detail page.
*/
@Excel(name = "笔记本详情页面")
private String detailPage;
/**
* Heat value of the notebook.
*/
@Excel(name = "热度")
private BigDecimal heat;
/**
* Price, kept as text rather than as a number.
*/
@Excel(name = "价格")
private String price;
/**
* Creation time; flagged for the MyBatis-Plus insert-time field fill.
*/
@Excel(name = "创建时间", dateFormat = "yyyy-MM-dd HH:mm:ss")
@TableField(fill = FieldFill.INSERT)
private Date createTime;
}

@ -0,0 +1,12 @@
package com.xjs.zol.service;
import com.baomidou.mybatisplus.extension.service.IService;
import com.xjs.zol.pojo.ZolNotebook;
/**
* Service interface for {@link ZolNotebook} entities.
* <p>
* Declares no methods of its own; everything it offers is inherited from {@link IService}.
*
* @author xiejs
* @since 2022-04-18
*/
public interface ZolNotebookService extends IService<ZolNotebook> {
}

@ -0,0 +1,16 @@
package com.xjs.zol.service.impl;
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
import com.xjs.zol.mapper.ZolNotebookMapper;
import com.xjs.zol.pojo.ZolNotebook;
import com.xjs.zol.service.ZolNotebookService;
import org.springframework.stereotype.Service;
/**
* Spring-managed implementation of {@link ZolNotebookService}.
* <p>
* Adds no behaviour of its own; the implementation is inherited from {@link ServiceImpl},
* parameterised with {@link ZolNotebookMapper} and {@link ZolNotebook}.
*
* @author xiejs
* @since 2022-04-18
*/
@Service
public class ZolNotebookServiceImpl extends ServiceImpl<ZolNotebookMapper, ZolNotebook> implements ZolNotebookService {
}

@ -2,8 +2,8 @@ package com.xjs.zol.task;
import com.ruoyi.common.redis.service.RedisService; import com.ruoyi.common.redis.service.RedisService;
import com.xjs.annotation.ReptileLog; import com.xjs.annotation.ReptileLog;
import com.xjs.zol.webmagic.ZolPhonePipeline; import com.xjs.zol.webmagic.ZolPipeline;
import com.xjs.zol.webmagic.ZolPhoneProcessor; import com.xjs.zol.webmagic.ZolProcessor;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component; import org.springframework.stereotype.Component;
@ -22,25 +22,25 @@ import static com.xjs.consts.ReptileConst.ZOL_PHONE_URL;
*/ */
@Component @Component
@Log4j2 @Log4j2
public class ZolPhoneTask { public class ZolTask {
@Autowired @Autowired
private ZolPhoneProcessor zolPhoneProcessor; private ZolProcessor zolProcessor;
@Autowired @Autowired
private RedisService redisService; private RedisService redisService;
@Autowired @Autowired
private ZolPhonePipeline zolPhonePipeline; private ZolPipeline zolPipeline;
@ReptileLog(name = "中关村手机", url = ZOL_PHONE_URL) @ReptileLog(name = "中关村", url = ZOL_PHONE_URL)
public Long reptileZolPhone() { public Long reptileZol() {
//执行爬虫 //执行爬虫
Spider.create(zolPhoneProcessor) Spider.create(zolProcessor)
.addUrl(ZOL_PHONE_URL)//设置爬取地址 .addUrl(ZOL_PHONE_URL)//设置爬取地址
.thread(30)//设置爬取线程数 .thread(30)//设置爬取线程数
.setScheduler(new QueueScheduler() .setScheduler(new QueueScheduler()
.setDuplicateRemover(new BloomFilterDuplicateRemover(110000)))//设置url去重过滤器 .setDuplicateRemover(new BloomFilterDuplicateRemover(110000)))//设置url去重过滤器
.addPipeline(zolPhonePipeline)//设置爬取之后的数据操作 .addPipeline(zolPipeline)//设置爬取之后的数据操作
//.setDownloader(downloader)//设置下载器 //.setDownloader(downloader)//设置下载器
.run();//同步执行 .run();//同步执行

@ -1,55 +0,0 @@
package com.xjs.zol.webmagic;
import cn.hutool.core.collection.CollUtil;
import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
import com.baomidou.mybatisplus.core.conditions.update.LambdaUpdateWrapper;
import com.xjs.zol.pojo.ZolPhone;
import com.xjs.zol.service.ZolPhoneService;
import lombok.extern.log4j.Log4j2;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Transactional;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import java.util.List;
import java.util.Objects;
/**
 * WebMagic pipeline that persists the phone list produced by the page processor.
 * <p>
 * Each phone is looked up by name: unknown phones are inserted, known phones only get
 * their heat refreshed when the scraped value differs from the stored one.
 *
 * @author xiejs
 * @since 2022-04-17
 */
@Component
@Log4j2
@Transactional
public class ZolPhonePipeline implements Pipeline {

    @Autowired
    private ZolPhoneService zolPhoneService;

    /**
     * Stores the phones found under the {@code "zolPhoneList"} result key.
     *
     * @param resultItems items extracted from one page; a missing or empty list is ignored
     * @param task        the running spider task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        List<ZolPhone> zolPhoneList = resultItems.get("zolPhoneList");
        if (CollUtil.isNotEmpty(zolPhoneList)) {
            for (ZolPhone zolPhone : zolPhoneList) {
                // throwEx = false: duplicates by name must not abort the whole page.
                ZolPhone dbData = zolPhoneService.getOne(new LambdaQueryWrapper<ZolPhone>()
                        .eq(ZolPhone::getPhoneName, zolPhone.getPhoneName()), false);

                // Insert only when no row with this name exists yet.
                if (Objects.isNull(dbData)) {
                    zolPhoneService.save(zolPhone);
                    continue;
                }

                // Refresh the heat only when it actually changed. A null stored heat counts
                // as "changed"; a null scraped heat is skipped instead of raising an NPE.
                boolean heatChanged = zolPhone.getHeat() != null
                        && (dbData.getHeat() == null
                        || zolPhone.getHeat().compareTo(dbData.getHeat()) != 0);
                if (heatChanged) {
                    zolPhoneService.update(new LambdaUpdateWrapper<ZolPhone>()
                            .eq(ZolPhone::getPhoneName, zolPhone.getPhoneName())
                            .set(ZolPhone::getHeat, zolPhone.getHeat()));
                }
            }
        }
    }
}

@ -1,141 +0,0 @@
package com.xjs.zol.webmagic;
import com.ruoyi.common.core.utils.StringUtils;
import com.ruoyi.common.redis.service.RedisService;
import com.xjs.zol.pojo.ZolPhone;
import lombok.extern.log4j.Log4j2;
import org.apache.commons.lang3.math.NumberUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.TimeUnit;
import static com.xjs.consts.RedisConst.REPTILE_ZOL_PHONE_COUNT;
/**
*
*
* @author xiejs
* @since 2022-04-17
*/
@Log4j2
@Component
public class ZolPhoneProcessor implements PageProcessor {
public static final String URL = "https://detail.zol.com.cn/";
@Autowired
private RedisService redisService;
@Override
public void process(Page page) {
try {
Integer count = redisService.getCacheObject(REPTILE_ZOL_PHONE_COUNT);
if (count == null) {
count = 0;
}
//获取其他页面放入队列中
//等待爬虫的页面后缀
String html_href = page.getHtml().css(".page-box > .pagebar > .next", "href").get();
Thread.sleep(100);
page.addTargetRequests(Collections.singletonList(html_href));
List<ZolPhone> zolPhoneList = new ArrayList<>();
//拿到每个手机的 li 标签
List<Selectable> lis = page.getHtml().css("#J_PicMode > li").nodes();
for (Selectable li : lis) {
ZolPhone zolPhone = new ZolPhone();
//排除无用数据
if ("display:none;".equals(li.css("li", "style").get())) {
continue;
}
//获取手机的详情页面url
String href = li.css("li > .pic", "href").get();
zolPhone.setDetailPage(URL + href);
//获取手机的名称
String phoneName = li.css("li > h3 > a", "text").get();
zolPhone.setPhoneName(phoneName);
//获取手机的描述
String desc = li.css("li > h3 > a > span", "text").get();
zolPhone.setDescription(desc);
//获取手机的参考价
String price = li.css("li > .price-row .price-type", "text").get();
//排除无用数据
if (StringUtils.isNotBlank(price)) {
//检查是否是数字
boolean creatable = NumberUtils.isCreatable(price);
if (creatable) {
zolPhone.setPrice(new BigDecimal(price));
} else {
continue;
}
} else {
continue;
}
//获取手机的评分
String heat = li.css("li > .comment-row > .score", "text").get();
if (StringUtils.isNotBlank(heat)) {
boolean creatable = NumberUtils.isCreatable(price);
if (creatable) {
zolPhone.setHeat(new BigDecimal(heat));
} else {
continue;
}
} else {
continue;
}
//获取手机图片的地址
String picture = li.css("li > .pic > img", ".src").get();
zolPhone.setPictureUrl(picture);
zolPhoneList.add(zolPhone);
//计数
count++;
}
page.putField("zolPhoneList", zolPhoneList);
redisService.setCacheObject(REPTILE_ZOL_PHONE_COUNT, count);
} catch (Exception e) {
log.error(e.getMessage());
e.printStackTrace();
} finally {
redisService.expire(REPTILE_ZOL_PHONE_COUNT, 3, TimeUnit.HOURS);
}
}
@Override
public Site getSite() {
return Site.me()
//.addHeader(headerKey, headerValue)
.setCharset("GBK")//设置字符编码
.setTimeOut(2000)//设置超时时间
.setRetrySleepTime(100)//设置重试间隔时间
.setCycleRetryTimes(10)//设置重试次数
.setSleepTime(1)//设置两个页面之间的间隔时间
;
}
}

@ -0,0 +1,99 @@
package com.xjs.zol.webmagic;
import cn.hutool.core.collection.CollUtil;
import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
import com.baomidou.mybatisplus.core.conditions.update.LambdaUpdateWrapper;
import com.xjs.zol.pojo.ZolNotebook;
import com.xjs.zol.pojo.ZolPhone;
import com.xjs.zol.service.ZolNotebookService;
import com.xjs.zol.service.ZolPhoneService;
import lombok.extern.log4j.Log4j2;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Transactional;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import java.util.List;
import java.util.Objects;
/**
 * WebMagic pipeline that persists the phones and notebooks produced by the page processor.
 * <p>
 * Entries are matched against the database by name: unknown entries are inserted, known
 * entries get their heat and price refreshed when the scraped values differ.
 *
 * @author xiejs
 * @since 2022-04-17
 */
@Component
@Log4j2
@Transactional
public class ZolPipeline implements Pipeline {

    @Autowired
    private ZolPhoneService zolPhoneService;

    @Autowired
    private ZolNotebookService zolNotebookService;

    /**
     * Stores whatever is present under the {@code "zolPhoneList"} and
     * {@code "zolNotebookList"} result keys; missing or empty lists are ignored.
     *
     * @param resultItems items extracted from one page
     * @param task        the running spider task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        List<ZolPhone> zolPhoneList = resultItems.get("zolPhoneList");
        this.saveOrUpdateZolPhoneData(zolPhoneList);
        List<ZolNotebook> zolNotebookList = resultItems.get("zolNotebookList");
        this.saveOrUpdateZolNotebookData(zolNotebookList);
    }

    /**
     * Inserts new phones and refreshes heat/price of phones already stored.
     *
     * @param zolPhoneList scraped phones, may be {@code null} or empty
     */
    private void saveOrUpdateZolPhoneData(List<ZolPhone> zolPhoneList) {
        if (CollUtil.isNotEmpty(zolPhoneList)) {
            for (ZolPhone zolPhone : zolPhoneList) {
                ZolPhone dbData = zolPhoneService.getOne(new LambdaQueryWrapper<ZolPhone>()
                        .eq(ZolPhone::getPhoneName, zolPhone.getPhoneName()), false);
                // Insert only when no row with this name exists yet.
                if (Objects.isNull(dbData)) {
                    zolPhoneService.save(zolPhone);
                    continue;
                }

                boolean heatChanged = differs(zolPhone.getHeat(), dbData.getHeat());
                boolean priceChanged = differs(zolPhone.getPrice(), dbData.getPrice());
                if (!heatChanged && !priceChanged) {
                    continue;
                }

                // One UPDATE carrying only the columns that changed.
                LambdaUpdateWrapper<ZolPhone> changes = new LambdaUpdateWrapper<ZolPhone>()
                        .eq(ZolPhone::getPhoneName, zolPhone.getPhoneName());
                if (heatChanged) {
                    changes.set(ZolPhone::getHeat, zolPhone.getHeat());
                }
                if (priceChanged) {
                    changes.set(ZolPhone::getPrice, zolPhone.getPrice());
                }
                zolPhoneService.update(changes);
            }
        }
    }

    /**
     * Inserts new notebooks and refreshes heat/price of notebooks already stored.
     *
     * @param zolNotebookList scraped notebooks, may be {@code null} or empty
     */
    private void saveOrUpdateZolNotebookData(List<ZolNotebook> zolNotebookList) {
        if (CollUtil.isNotEmpty(zolNotebookList)) {
            for (ZolNotebook zolNotebook : zolNotebookList) {
                ZolNotebook dbData = zolNotebookService.getOne(new LambdaQueryWrapper<ZolNotebook>()
                        .eq(ZolNotebook::getNotebookName, zolNotebook.getNotebookName()), false);
                // Insert only when no row with this name exists yet.
                if (Objects.isNull(dbData)) {
                    zolNotebookService.save(zolNotebook);
                    continue;
                }

                boolean heatChanged = differs(zolNotebook.getHeat(), dbData.getHeat());
                boolean priceChanged = differs(zolNotebook.getPrice(), dbData.getPrice());
                if (!heatChanged && !priceChanged) {
                    continue;
                }

                // One UPDATE carrying only the columns that changed.
                LambdaUpdateWrapper<ZolNotebook> changes = new LambdaUpdateWrapper<ZolNotebook>()
                        .eq(ZolNotebook::getNotebookName, zolNotebook.getNotebookName());
                if (heatChanged) {
                    changes.set(ZolNotebook::getHeat, zolNotebook.getHeat());
                }
                if (priceChanged) {
                    changes.set(ZolNotebook::getPrice, zolNotebook.getPrice());
                }
                zolNotebookService.update(changes);
            }
        }
    }

    /**
     * Null-safe "has this value changed" check.
     * <p>
     * Uses {@code compareTo} rather than {@code equals} so that decimals differing only in
     * scale (e.g. {@code 4.50} vs {@code 4.5}) are treated as unchanged.
     *
     * @param fresh  the newly scraped value
     * @param stored the value currently in the database
     * @return {@code true} when {@code fresh} is present and differs from {@code stored};
     * a {@code null} scraped value never overwrites stored data
     */
    private static <T extends Comparable<? super T>> boolean differs(T fresh, T stored) {
        if (fresh == null) {
            return false;
        }
        return stored == null || fresh.compareTo(stored) != 0;
    }
}

@ -0,0 +1,226 @@
package com.xjs.zol.webmagic;
import com.ruoyi.common.core.utils.StringUtils;
import com.ruoyi.common.redis.service.RedisService;
import com.xjs.zol.pojo.ZolNotebook;
import com.xjs.zol.pojo.ZolPhone;
import lombok.extern.log4j.Log4j2;
import org.apache.commons.lang3.math.NumberUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.TimeUnit;
import static com.xjs.consts.RedisConst.REPTILE_ZOL_PHONE_COUNT;
/**
*
*
* @author xiejs
* @since 2022-04-17
*/
@Log4j2
@Component
@SuppressWarnings("all")
public class ZolProcessor implements PageProcessor {
public static final String URL = "https://detail.zol.com.cn/";
@Autowired
private RedisService redisService;
@Override
public void process(Page page) {
try {
Integer count = redisService.getCacheObject(REPTILE_ZOL_PHONE_COUNT);
if (count == null) {
count = 0;
}
List<Selectable> nodes = page.getHtml().css("#J_CategoryItems > .item > h3 > a").nodes();
for (Selectable node : nodes) {
if ("手机".equals(node.css("a", "text").get())) {
String href = node.css("a", "href").get();
page.addTargetRequests(Collections.singletonList(href));
}
if ("笔记本".equals(node.css("a", "text").get())) {
String href = node.css("a", "href").get();
page.addTargetRequests(Collections.singletonList(href));
}
}
//获取其他页面放入队列中
//等待爬虫的页面后缀
String html_href = page.getHtml().css(".page-box > .pagebar > .next", "href").get();
Thread.sleep(1);
page.addTargetRequests(Collections.singletonList(html_href));
//获取当前页面是手机还是电脑
String title = page.getHtml().css(".wrapper > .breadcrumb-filter-selected > .breadcrumb > span", "text").get();
//拿到每个li 标签
List<Selectable> lis = page.getHtml().css("#J_PicMode > li").nodes();
if (StringUtils.isNotEmpty(title) && title.contains("手机")) {
List<ZolPhone> zolPhoneList = new ArrayList<>();
for (Selectable li : lis) {
ZolPhone zolPhone = new ZolPhone();
//排除无用数据
if ("display:none;".equals(li.css("li", "style").get())) {
continue;
}
//获取手机的详情页面url
String href = li.css("li > .pic", "href").get();
zolPhone.setDetailPage(URL + href);
//获取手机的名称
String phoneName = li.css("li > h3 > a", "text").get();
zolPhone.setPhoneName(phoneName);
//获取手机的描述
String desc = li.css("li > h3 > a > span", "text").get();
zolPhone.setDescription(desc);
//获取手机的参考价
String price = li.css("li > .price-row .price-type", "text").get();
//排除无用数据
if (StringUtils.isNotBlank(price)) {
//检查是否是数字
boolean creatable = NumberUtils.isCreatable(price);
if (creatable) {
zolPhone.setPrice(new BigDecimal(price));
} else {
continue;
}
} else {
continue;
}
//获取手机的评分
String heat = li.css("li > .comment-row > .score", "text").get();
if (StringUtils.isNotBlank(heat)) {
boolean creatable = NumberUtils.isCreatable(heat);
if (creatable) {
zolPhone.setHeat(new BigDecimal(heat));
} else {
continue;
}
} else {
continue;
}
//获取手机图片的地址
String picture = li.css("li > .pic > img", ".src").get();
zolPhone.setPictureUrl(picture);
zolPhoneList.add(zolPhone);
//计数
count++;
}
page.putField("zolPhoneList", zolPhoneList);
}
if (StringUtils.isNotEmpty(title) && title.contains("笔记本")) {
ArrayList<ZolNotebook> zolNotebooks = new ArrayList<>();
for (Selectable li : lis) {
ZolNotebook zolNotebook = new ZolNotebook();
//排除无用数据
if ("display:none;".equals(li.css("li", "style").get())) {
continue;
}
//获取笔记本的详情页面url
String href = li.css("li > .pic", "href").get();
zolNotebook.setDetailPage(URL + href);
//获取笔记本的名称
String notebookName = li.css("li > h3 > .title-black > a", "text").get();
zolNotebook.setNotebookName(notebookName);
//获取笔记本的描述
String desc = li.css("li > h3 > a", "text").get();
zolNotebook.setDescription(desc);
//获取笔记本的参考价
String price = li.css("li > .price-row .price-type", "text").get();
//排除无用数据
if (StringUtils.isNotBlank(price)) {
if ("停产".equals(price)
|| "概念产品".equals(price)
|| "即将上市".equals(price)
|| "暂无报价".equals(price)) {
continue;
}
zolNotebook.setPrice(price);
} else {
continue;
}
//获取笔记本的评分
String heat = li.css("li > .comment-row > .score", "text").get();
if (StringUtils.isNotBlank(heat)) {
boolean creatable = NumberUtils.isCreatable(heat);
if (creatable) {
zolNotebook.setHeat(new BigDecimal(heat));
} else {
continue;
}
} else {
continue;
}
//获取笔记本图片的地址
String picture = li.css("li > .pic > img", ".src").get();
zolNotebook.setPictureUrl(picture);
zolNotebooks.add(zolNotebook);
count++;
}
page.putField("zolNotebookList", zolNotebooks);
}
redisService.setCacheObject(REPTILE_ZOL_PHONE_COUNT, count);
} catch (Exception e) {
log.error(e.getMessage());
e.printStackTrace();
} finally {
redisService.expire(REPTILE_ZOL_PHONE_COUNT, 3, TimeUnit.HOURS);
}
}
@Override
public Site getSite() {
return Site.me()
//.addHeader(headerKey, headerValue)
.setCharset("GBK")//设置字符编码
.setTimeOut(3000)//设置超时时间
.setRetrySleepTime(100)//设置重试间隔时间
.setCycleRetryTimes(10)//设置重试次数
.setSleepTime(1)//设置两个页面之间的间隔时间
;
}
}

@ -5,21 +5,19 @@ import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest; import org.springframework.boot.test.context.SpringBootTest;
import static org.junit.jupiter.api.Assertions.*;
/** /**
* @author xiejs * @author xiejs
* @since 2022-04-17 * @since 2022-04-17
*/ */
@SpringBootTest(classes = XjsWebmagicApp.class) @SpringBootTest(classes = XjsWebmagicApp.class)
class ZolPhoneTaskTest { class ZolTaskTest {
@Autowired @Autowired
private ZolPhoneTask zolPhoneTask; private ZolTask zolTask;
@Test @Test
void reptileZolPhone() { void reptileZolPhone() {
Long aLong = zolPhoneTask.reptileZolPhone(); Long aLong = zolTask.reptileZol();
System.out.println(aLong); System.out.println(aLong);
} }
} }
Loading…
Cancel
Save