parent
247fe674cf
commit
1b0625318c
@ -0,0 +1,21 @@
|
||||
package com.xjs.business.webmagic;
|
||||
|
||||
import com.ruoyi.common.core.constant.ServiceNameConstants;
|
||||
import com.ruoyi.common.core.domain.R;
|
||||
import com.xjs.business.webmagic.factory.RemoteWebmagicSinaFactory;
|
||||
import org.springframework.cloud.openfeign.FeignClient;
|
||||
import org.springframework.web.bind.annotation.GetMapping;
|
||||
|
||||
/**
|
||||
* 内部 调用 新浪 爬虫定时任务
|
||||
* @author xiejs
|
||||
* @since 2022-02-15
|
||||
*/
|
||||
@FeignClient(contextId = "remoteWebmagicSinaFeign",
|
||||
value = ServiceNameConstants.BUSINESS_WEBMAGIC_SERVICE,
|
||||
fallbackFactory = RemoteWebmagicSinaFactory.class)
|
||||
public interface RemoteWebmagicSinaFeign {
|
||||
|
||||
@GetMapping("/sina/taskForPRC")
|
||||
R sinaTaskForPRC();
|
||||
}
|
@ -0,0 +1,28 @@
|
||||
package com.xjs.business.webmagic.factory;
|
||||
|
||||
import com.ruoyi.common.core.domain.R;
|
||||
import com.xjs.business.webmagic.RemoteWebmagicSinaFeign;
|
||||
import lombok.extern.log4j.Log4j2;
|
||||
import org.springframework.cloud.openfeign.FallbackFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
/**
|
||||
* 内部 调用 新浪 爬虫定时任务 降级
|
||||
* @author xiejs
|
||||
* @since 2022-02-15
|
||||
*/
|
||||
@Component
|
||||
@Log4j2
|
||||
public class RemoteWebmagicSinaFactory implements FallbackFactory<RemoteWebmagicSinaFeign> {
|
||||
@Override
|
||||
public RemoteWebmagicSinaFeign create(Throwable cause) {
|
||||
|
||||
return new RemoteWebmagicSinaFeign() {
|
||||
@Override
|
||||
public R sinaTaskForPRC() {
|
||||
log.error("新浪 爬虫定时任务 降级");
|
||||
return R.fail("降级处理");
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
@ -0,0 +1,39 @@
|
||||
package com.xjs.job.task.webmagic;
|
||||
|
||||
import cn.hutool.core.date.DateUtil;
|
||||
import com.ruoyi.common.core.domain.R;
|
||||
import com.xjs.business.webmagic.RemoteWebmagicSinaFeign;
|
||||
import lombok.extern.log4j.Log4j2;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import javax.annotation.Resource;
|
||||
import java.time.LocalDateTime;
|
||||
import java.time.temporal.ChronoUnit;
|
||||
|
||||
/**
|
||||
* 爬虫 新浪新闻 定时任务
|
||||
* @author xiejs
|
||||
* @since 2022-02-15
|
||||
*/
|
||||
@Component("SinaTask")
|
||||
@Log4j2
|
||||
public class SinaTask {
|
||||
@Resource
|
||||
private RemoteWebmagicSinaFeign remoteWebmagicSinaFeign;
|
||||
|
||||
/**
|
||||
* 任务执行
|
||||
*/
|
||||
public void sinaNews() {
|
||||
log.info("---------------爬虫-新浪新闻定时任务Start-------------------");
|
||||
LocalDateTime localDateTime1 = DateUtil.date().toLocalDateTime();
|
||||
|
||||
R r = remoteWebmagicSinaFeign.sinaTaskForPRC();
|
||||
|
||||
log.info("爬虫-新浪新闻定时任务结果:code={},msg={},data={}",r.getCode(),r.getMsg(),r.getData());
|
||||
LocalDateTime localDateTime2 = DateUtil.date().toLocalDateTime();
|
||||
long between = ChronoUnit.MILLIS.between(localDateTime1, localDateTime2);
|
||||
log.info("爬虫-新浪新闻定时任务Job耗费时间:{}ms", between);
|
||||
log.info("---------------爬虫-新浪新闻定时任务end---------------------");
|
||||
}
|
||||
}
|
@ -1,30 +0,0 @@
|
||||
package com.xjs.handler;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
|
||||
/**
|
||||
* 米哈游社区爬取资源
|
||||
* @author xiejs
|
||||
* @since 2022-01-24
|
||||
*/
|
||||
public class MiHoYoRepoPageProcessor implements PageProcessor {
|
||||
|
||||
|
||||
private Site site = Site.me().setRetryTimes(3).setSleepTime(100);
|
||||
|
||||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return site;
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
@ -0,0 +1,42 @@
|
||||
package com.xjs.sina.controller;
|
||||
|
||||
import com.ruoyi.common.core.domain.R;
|
||||
import com.xjs.sina.task.SinaTask;
|
||||
import io.swagger.annotations.Api;
|
||||
import io.swagger.annotations.ApiOperation;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.web.bind.annotation.GetMapping;
|
||||
import org.springframework.web.bind.annotation.RequestMapping;
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
|
||||
/**
|
||||
* 新浪爬虫数据controller
|
||||
* @author xiejs
|
||||
* @since 2022-02-15
|
||||
*/
|
||||
@RestController
|
||||
@RequestMapping("sina")
|
||||
@Api(tags = "爬虫模块-新浪新闻")
|
||||
public class SinaNewsController {
|
||||
@Autowired
|
||||
private SinaTask sinaTask;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
//----------------------远程rpc调用---------------------------
|
||||
@GetMapping("taskForPRC")
|
||||
@ApiOperation("供定时任务服务RPC远程调用")
|
||||
public R sinaTaskForPRC() {
|
||||
sinaTask.reptileSinaNews();
|
||||
return R.ok();
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,17 @@
|
||||
package com.xjs.sina.mapper;
|
||||
|
||||
import com.baomidou.mybatisplus.core.mapper.BaseMapper;
|
||||
import com.xjs.sina.pojo.SinaNews;
|
||||
|
||||
/**
|
||||
* @author xiejs
|
||||
* @since 2022-02-15
|
||||
*/
|
||||
public interface SinaNewsMapper extends BaseMapper<SinaNews> {
|
||||
|
||||
/**
|
||||
* 删除重复数据
|
||||
* @return int
|
||||
*/
|
||||
int deleteRepeatData();
|
||||
}
|
@ -0,0 +1,39 @@
|
||||
package com.xjs.sina.pojo;
|
||||
|
||||
import com.baomidou.mybatisplus.annotation.TableId;
|
||||
import com.baomidou.mybatisplus.annotation.TableName;
|
||||
import lombok.Data;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Date;
|
||||
|
||||
/**
|
||||
* @author xiejs
|
||||
* @since 2022-02-15
|
||||
*/
|
||||
@TableName("webmagic_sina_news")
|
||||
@Data
|
||||
public class SinaNews implements Serializable {
|
||||
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
@TableId
|
||||
private Long id;
|
||||
|
||||
/**
|
||||
* 新闻标题
|
||||
*/
|
||||
private String title;
|
||||
|
||||
/**
|
||||
* 新闻分类
|
||||
*/
|
||||
private String category;
|
||||
|
||||
/**
|
||||
* 新闻地址
|
||||
*/
|
||||
private String url;
|
||||
|
||||
private Date createTime;
|
||||
}
|
@ -0,0 +1,17 @@
|
||||
package com.xjs.sina.service;
|
||||
|
||||
import com.baomidou.mybatisplus.extension.service.IService;
|
||||
import com.xjs.sina.pojo.SinaNews;
|
||||
|
||||
/**
|
||||
* 新浪新闻爬虫Service接口
|
||||
* @author xiejs
|
||||
* @since 2022-02-15
|
||||
*/
|
||||
public interface SinaNewsService extends IService<SinaNews> {
|
||||
/**
|
||||
* 删除重复数据
|
||||
* @return int
|
||||
*/
|
||||
int deleteRepeatData();
|
||||
}
|
@ -0,0 +1,25 @@
|
||||
package com.xjs.sina.service.impl;
|
||||
|
||||
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
|
||||
import com.xjs.sina.mapper.SinaNewsMapper;
|
||||
import com.xjs.sina.pojo.SinaNews;
|
||||
import com.xjs.sina.service.SinaNewsService;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import javax.annotation.Resource;
|
||||
|
||||
/**
|
||||
* 新浪新闻爬虫Service接口实现
|
||||
* @author xiejs
|
||||
* @since 2022-02-15
|
||||
*/
|
||||
@Service
|
||||
public class SinaNewsServiceImpl extends ServiceImpl<SinaNewsMapper, SinaNews> implements SinaNewsService {
|
||||
@Resource
|
||||
private SinaNewsMapper sinaNewsMapper;
|
||||
|
||||
@Override
|
||||
public int deleteRepeatData() {
|
||||
return sinaNewsMapper.deleteRepeatData();
|
||||
}
|
||||
}
|
@ -0,0 +1,176 @@
|
||||
package com.xjs.sina.task;
|
||||
|
||||
import cn.hutool.core.collection.CollUtil;
|
||||
import com.ruoyi.common.core.utils.StringUtils;
|
||||
import com.xjs.common.util.HttpUtils;
|
||||
import com.xjs.sina.pojo.SinaNews;
|
||||
import com.xjs.sina.service.SinaNewsService;
|
||||
import lombok.extern.log4j.Log4j2;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.select.Elements;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* 新浪新闻爬虫任务
|
||||
* @author xiejs
|
||||
* @since 2022-02-15
|
||||
*/
|
||||
@Component
|
||||
@Log4j2
|
||||
public class SinaTask {
|
||||
|
||||
@Autowired
|
||||
private HttpUtils httpUtils;
|
||||
@Autowired
|
||||
private SinaNewsService sinaNewsService;
|
||||
|
||||
public void reptileSinaNews() {
|
||||
try {
|
||||
String url = "https://news.sina.com.cn/";
|
||||
|
||||
String html = httpUtils.doGetHtml(url);
|
||||
|
||||
Document document = Jsoup.parse(html);
|
||||
|
||||
this.parse(document);
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 解析dom
|
||||
*
|
||||
* @param document dom
|
||||
*/
|
||||
private void parse(Document document) {
|
||||
try {
|
||||
//获取子链接
|
||||
Elements nav_mod_1 = document.getElementsByClass("nav-mod-1");
|
||||
Elements link = nav_mod_1.select("ul > li > a");
|
||||
List<Map<String, String>> hrefList = link.stream().map(a -> {
|
||||
String href = a.attr("href");
|
||||
String text = a.text();
|
||||
Map<String, String> map = new HashMap<>();
|
||||
map.put(text, href);
|
||||
return map;
|
||||
}).collect(Collectors.toList());
|
||||
hrefList.removeIf(s -> s.containsKey("javascript:;"));
|
||||
|
||||
for (Map<String, String> map : hrefList) {
|
||||
Set<Map.Entry<String, String>> entrySet = map.entrySet();
|
||||
for (Map.Entry<String, String> entry : entrySet) {
|
||||
String html = httpUtils.doGetHtml(entry.getValue());
|
||||
Document docChild = Jsoup.parse(html);
|
||||
this.parseChile(docChild, entry.getKey());
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 解析子dom
|
||||
*
|
||||
* @param docChild 子
|
||||
* @param key key
|
||||
*/
|
||||
private void parseChile(Document docChild, String key) {
|
||||
try {
|
||||
Elements a = docChild.getElementsByTag("a");
|
||||
ArrayList<String> link = new ArrayList<>();
|
||||
for (Element element : a) {
|
||||
String href = element.attr("href");
|
||||
if (href.contains(".html") || href.contains(".shtml")) {
|
||||
link.add(href);
|
||||
}
|
||||
}
|
||||
|
||||
ArrayList<SinaNews> sinaNewsList = new ArrayList<>();
|
||||
|
||||
//遍历每个文章页面,然后持久化到数据库
|
||||
for (String url : link) {
|
||||
//url不包含yyyy-dd- 直接跳过
|
||||
if (!url.contains("-")) {
|
||||
continue;
|
||||
}
|
||||
String html = httpUtils.doGetHtml(url);
|
||||
Document document = Jsoup.parse(html);
|
||||
Elements main_title = document.getElementsByClass("main-title");
|
||||
Elements tit = document.getElementsByClass("tit");
|
||||
Element artibodyTitle = document.getElementById("artibodyTitle");
|
||||
Elements F_yahei = document.getElementsByClass("F-yahei");
|
||||
Elements crt_h1 = document.select(".crticalcontent > h1");
|
||||
Elements crth_h1 = document.select(".article-header > h1");
|
||||
|
||||
|
||||
if (CollUtil.isNotEmpty(main_title)
|
||||
|| CollUtil.isNotEmpty(tit)
|
||||
|| artibodyTitle != null
|
||||
|| CollUtil.isNotEmpty(F_yahei)
|
||||
|| CollUtil.isNotEmpty(crt_h1)
|
||||
|| CollUtil.isNotEmpty(crth_h1)) {
|
||||
String title = null;
|
||||
if (CollUtil.isNotEmpty(main_title)) {
|
||||
title = main_title.text();
|
||||
}
|
||||
if (title == null) {
|
||||
if (CollUtil.isNotEmpty(tit)) {
|
||||
title = tit.text();
|
||||
}
|
||||
}
|
||||
if (title == null) {
|
||||
if (artibodyTitle != null) {
|
||||
title = artibodyTitle.text();
|
||||
}
|
||||
}
|
||||
if (title == null) {
|
||||
if (CollUtil.isNotEmpty(F_yahei)) {
|
||||
title = F_yahei.text();
|
||||
}
|
||||
}
|
||||
if (title == null) {
|
||||
if (CollUtil.isNotEmpty(crt_h1)) {
|
||||
title = crt_h1.text();
|
||||
}
|
||||
}
|
||||
if (title == null) {
|
||||
if (CollUtil.isNotEmpty(crth_h1)) {
|
||||
title = crth_h1.text();
|
||||
}
|
||||
}
|
||||
|
||||
if (StringUtils.isEmpty(title)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
//持久化
|
||||
SinaNews sinaNews = new SinaNews();
|
||||
sinaNews.setCategory(key);
|
||||
sinaNews.setTitle(title);
|
||||
sinaNews.setUrl(url);
|
||||
sinaNews.setCreateTime(new Date());
|
||||
|
||||
sinaNewsList.add(sinaNews);
|
||||
}
|
||||
}
|
||||
sinaNewsService.saveBatch(sinaNewsList, 30);
|
||||
|
||||
//删除重复
|
||||
int count = sinaNewsService.deleteRepeatData();
|
||||
log.info("重复数据为:{}", count);
|
||||
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,21 @@
|
||||
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper
        PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
        "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.xjs.sina.mapper.SinaNewsMapper">

    <!--
        Removes duplicated news rows: for every title that occurs more than once,
        all rows are deleted except the one with the smallest id.
        The sub-selects are wrapped in derived tables (t, c), presumably so the
        statement may read from the same table it deletes from.
    -->
    <delete id="deleteRepeatData">
        DELETE
        FROM
            webmagic_sina_news
        WHERE
            title IN (
                SELECT t.title FROM ( SELECT title FROM webmagic_sina_news GROUP BY title HAVING count( title ) > 1 ) t
            )
            AND id NOT IN (
                SELECT c.id FROM ( SELECT min( id ) id FROM webmagic_sina_news GROUP BY title HAVING count( title )> 1 ) c
            )
    </delete>
</mapper>
|
Loading…
Reference in new issue