1、爬虫中关村手机所有页面数据爬取并保存到数据库实现

pull/254/head
xjs 3 years ago
parent 1a5624533f
commit f22a31a333

@ -1,6 +1,8 @@
package com.xjs.zol.webmagic; package com.xjs.zol.webmagic;
import cn.hutool.core.collection.CollUtil;
import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper; import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
import com.baomidou.mybatisplus.core.conditions.update.LambdaUpdateWrapper;
import com.xjs.zol.pojo.ZolPhone; import com.xjs.zol.pojo.ZolPhone;
import com.xjs.zol.service.ZolPhoneService; import com.xjs.zol.service.ZolPhoneService;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
@ -31,12 +33,22 @@ public class ZolPhonePipeline implements Pipeline {
@Override @Override
public void process(ResultItems resultItems, Task task) { public void process(ResultItems resultItems, Task task) {
List<ZolPhone> zolPhoneList = resultItems.get("zolPhoneList"); List<ZolPhone> zolPhoneList = resultItems.get("zolPhoneList");
//循环遍历集合,当对象的名称在数据库为空才插入数据 if (CollUtil.isNotEmpty(zolPhoneList)) {
for (ZolPhone zolPhone : zolPhoneList) { //循环遍历集合,当对象的名称在数据库为空才插入数据
ZolPhone dbData = zolPhoneService.getOne(new LambdaQueryWrapper<ZolPhone>() for (ZolPhone zolPhone : zolPhoneList) {
.eq(ZolPhone::getPhoneName, zolPhone.getPhoneName()), false); ZolPhone dbData = zolPhoneService.getOne(new LambdaQueryWrapper<ZolPhone>()
if (Objects.isNull(dbData)) { .eq(ZolPhone::getPhoneName, zolPhone.getPhoneName()), false);
zolPhoneService.save(zolPhone); if (Objects.isNull(dbData)) {
zolPhoneService.save(zolPhone);
} else {
//当前值与数据库热度值不相等的情况下更新数据库
if (zolPhone.getHeat().compareTo(dbData.getHeat()) != 0) {
zolPhoneService.update(new LambdaUpdateWrapper<ZolPhone>()
.eq(ZolPhone::getPhoneName, zolPhone.getPhoneName())
.set(ZolPhone::getHeat, zolPhone.getHeat()));
}
}
} }
} }
} }

@ -1,8 +1,10 @@
package com.xjs.zol.webmagic; package com.xjs.zol.webmagic;
import com.ruoyi.common.core.utils.StringUtils;
import com.ruoyi.common.redis.service.RedisService; import com.ruoyi.common.redis.service.RedisService;
import com.xjs.zol.pojo.ZolPhone; import com.xjs.zol.pojo.ZolPhone;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import org.apache.commons.lang3.math.NumberUtils;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component; import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
@ -12,6 +14,7 @@ import us.codecraft.webmagic.selector.Selectable;
import java.math.BigDecimal; import java.math.BigDecimal;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
@ -27,6 +30,8 @@ import static com.xjs.consts.RedisConst.REPTILE_ZOL_PHONE_COUNT;
@Component @Component
public class ZolPhoneProcessor implements PageProcessor { public class ZolPhoneProcessor implements PageProcessor {
public static final String URL = "https://detail.zol.com.cn/";
@Autowired @Autowired
private RedisService redisService; private RedisService redisService;
@ -38,6 +43,13 @@ public class ZolPhoneProcessor implements PageProcessor {
if (count == null) { if (count == null) {
count = 0; count = 0;
} }
//获取其他页面放入队列中
//等待爬虫的页面后缀
String html_href = page.getHtml().css(".page-box > .pagebar > .next", "href").get();
Thread.sleep(100);
page.addTargetRequests(Collections.singletonList(html_href));
List<ZolPhone> zolPhoneList = new ArrayList<>(); List<ZolPhone> zolPhoneList = new ArrayList<>();
@ -55,7 +67,7 @@ public class ZolPhoneProcessor implements PageProcessor {
//获取手机的详情页面url //获取手机的详情页面url
String href = li.css("li > .pic", "href").get(); String href = li.css("li > .pic", "href").get();
zolPhone.setDetailPage("https://detail.zol.com.cn/" + href); zolPhone.setDetailPage(URL + href);
//获取手机的名称 //获取手机的名称
String phoneName = li.css("li > h3 > a", "text").get(); String phoneName = li.css("li > h3 > a", "text").get();
@ -68,14 +80,30 @@ public class ZolPhoneProcessor implements PageProcessor {
//获取手机的参考价 //获取手机的参考价
String price = li.css("li > .price-row .price-type", "text").get(); String price = li.css("li > .price-row .price-type", "text").get();
//排除无用数据 //排除无用数据
if ("概念产品".equals(price)) { if (StringUtils.isNotBlank(price)) {
//检查是否是数字
boolean creatable = NumberUtils.isCreatable(price);
if (creatable) {
zolPhone.setPrice(new BigDecimal(price));
} else {
continue;
}
} else {
continue; continue;
} }
zolPhone.setPrice(new BigDecimal(price));
//获取手机的评分 //获取手机的评分
String heat = li.css("li > .comment-row > .score", "text").get(); String heat = li.css("li > .comment-row > .score", "text").get();
zolPhone.setHeat(new BigDecimal(heat)); if (StringUtils.isNotBlank(heat)) {
boolean creatable = NumberUtils.isCreatable(heat);
if (creatable) {
zolPhone.setHeat(new BigDecimal(heat));
} else {
continue;
}
} else {
continue;
}
//获取手机图片的地址 //获取手机图片的地址
String picture = li.css("li > .pic > img", ".src").get(); String picture = li.css("li > .pic > img", ".src").get();
@ -87,7 +115,7 @@ public class ZolPhoneProcessor implements PageProcessor {
count++; count++;
} }
page.putField("zolPhoneList",zolPhoneList); page.putField("zolPhoneList", zolPhoneList);
redisService.setCacheObject(REPTILE_ZOL_PHONE_COUNT, count); redisService.setCacheObject(REPTILE_ZOL_PHONE_COUNT, count);
} catch (Exception e) { } catch (Exception e) {

Loading…
Cancel
Save