1、爬虫服务实现新浪新闻爬虫功能

2、远程定时任务调用新浪爬虫功能
pull/254/head
xjs 4 years ago
parent 247fe674cf
commit 1b0625318c

@ -16,9 +16,9 @@ import org.springframework.web.bind.annotation.RequestBody;
import java.util.List; import java.util.List;
/** /**
* rpccrud
* @author xiejs * @author xiejs
* @desc rpccrud * @since 2021-12-31
* @create 2021-12-31
*/ */
@FeignClient(contextId = "remoteWarningCRUDFeign", @FeignClient(contextId = "remoteWarningCRUDFeign",
value = ServiceNameConstants.BUSINESS_WARNING_SERVICE, value = ServiceNameConstants.BUSINESS_WARNING_SERVICE,

@ -0,0 +1,21 @@
package com.xjs.business.webmagic;
import com.ruoyi.common.core.constant.ServiceNameConstants;
import com.ruoyi.common.core.domain.R;
import com.xjs.business.webmagic.factory.RemoteWebmagicSinaFactory;
import org.springframework.cloud.openfeign.FeignClient;
import org.springframework.web.bind.annotation.GetMapping;
/**
 * Feign client for the Sina news endpoint of the webmagic crawler service.
 *
 * <p>The client targets {@code ServiceNameConstants.BUSINESS_WEBMAGIC_SERVICE} and declares
 * {@link RemoteWebmagicSinaFactory} as its fallback factory.
 *
 * @author xiejs
 * @since 2022-02-15
 */
@FeignClient(contextId = "remoteWebmagicSinaFeign",
value = ServiceNameConstants.BUSINESS_WEBMAGIC_SERVICE,
fallbackFactory = RemoteWebmagicSinaFactory.class)
public interface RemoteWebmagicSinaFeign {
/**
 * Issues {@code GET /sina/taskForPRC} against the crawler service.
 *
 * @return the response wrapper produced by the remote endpoint, or the result built by the
 *         fallback factory when the call is degraded
 */
@GetMapping("/sina/taskForPRC")
R sinaTaskForPRC();
}

@ -0,0 +1,28 @@
package com.xjs.business.webmagic.factory;
import com.ruoyi.common.core.domain.R;
import com.xjs.business.webmagic.RemoteWebmagicSinaFeign;
import lombok.extern.log4j.Log4j2;
import org.springframework.cloud.openfeign.FallbackFactory;
import org.springframework.stereotype.Component;
/**
*
* @author xiejs
* @since 2022-02-15
*/
@Component
@Log4j2
public class RemoteWebmagicSinaFactory implements FallbackFactory<RemoteWebmagicSinaFeign> {
@Override
public RemoteWebmagicSinaFeign create(Throwable cause) {
return new RemoteWebmagicSinaFeign() {
@Override
public R sinaTaskForPRC() {
log.error("新浪 爬虫定时任务 降级");
return R.fail("降级处理");
}
};
}
}

@ -38,6 +38,12 @@ public class ServiceNameConstants
*/ */
public static final String BUSINESS_LOG_SERVICE= "xjs-log" ; public static final String BUSINESS_LOG_SERVICE= "xjs-log" ;
/**
 * Service id of the webmagic crawler service; Feign clients use it as their target service.
 */
public static final String BUSINESS_WEBMAGIC_SERVICE= "xjs-webmagic" ;

@ -0,0 +1,39 @@
package com.xjs.job.task.webmagic;
import cn.hutool.core.date.DateUtil;
import com.ruoyi.common.core.domain.R;
import com.xjs.business.webmagic.RemoteWebmagicSinaFeign;
import lombok.extern.log4j.Log4j2;
import org.springframework.stereotype.Component;
import javax.annotation.Resource;
import java.time.LocalDateTime;
import java.time.temporal.ChronoUnit;
/**
 * Scheduled job that triggers the Sina news crawl on the remote webmagic service.
 *
 * @author xiejs
 * @since 2022-02-15
 */
@Component("SinaTask")
@Log4j2
public class SinaTask {

    /** Nanoseconds per millisecond, used to report the elapsed time in milliseconds. */
    private static final long NANOS_PER_MILLI = 1_000_000L;

    @Resource
    private RemoteWebmagicSinaFeign remoteWebmagicSinaFeign;

    /**
     * Calls the remote Sina crawl endpoint and logs its result and the time the call took.
     *
     * <p>Any exception thrown by the remote call is propagated to the caller; the elapsed time
     * and the end marker are still logged in that case.
     */
    public void sinaNews() {
        log.info("---------------爬虫-新浪新闻定时任务Start-------------------");
        // System.nanoTime() is monotonic. Subtracting two wall-clock LocalDateTime values
        // gives wrong durations across DST transitions or system clock adjustments.
        long startNanos = System.nanoTime();
        try {
            R r = remoteWebmagicSinaFeign.sinaTaskForPRC();
            log.info("爬虫-新浪新闻定时任务结果:code={},msg={},data={}", r.getCode(), r.getMsg(), r.getData());
        } finally {
            long between = (System.nanoTime() - startNanos) / NANOS_PER_MILLI;
            log.info("爬虫-新浪新闻定时任务Job耗费时间:{}ms", between);
            log.info("---------------爬虫-新浪新闻定时任务end---------------------");
        }
    }
}

@ -0,0 +1,151 @@
package com.xjs.common.util;
import com.ruoyi.common.core.constant.HttpStatus;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.springframework.stereotype.Component;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.UUID;
/**
 * Small HTTP helper built on Apache HttpClient with a pooled connection manager.
 *
 * <p>Offers a GET that returns the response body as text and a GET that stores the response
 * body as a file.
 *
 * @author xiejs
 * @since 2022-02-15
 */
@Component
public class HttpUtils {

    /** User-Agent header sent by {@link #doGetHtml(String)}. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0";

    /**
     * Prefix prepended to the generated file name in {@link #doGetImage(String)}.
     *
     * <p>NOTE(review): the prefix is a hard-coded Windows path and does not end with a path
     * separator, so files are created as {@code ...\jd\image<uuid><ext>} rather than inside an
     * {@code image} directory — confirm that this is intended.
     */
    private static final String IMAGE_PATH_PREFIX = "D:\\Dev\\WebCrawler\\jd\\image";

    private final PoolingHttpClientConnectionManager cm;

    /** Client shared by all requests; it draws its connections from {@link #cm}. */
    private final CloseableHttpClient httpClient;

    /**
     * Creates the connection pool (100 connections in total, 10 per route) and the client
     * that uses it.
     */
    public HttpUtils() {
        this.cm = new PoolingHttpClientConnectionManager();
        // Maximum number of connections in the pool
        this.cm.setMaxTotal(100);
        // Maximum number of connections per route
        this.cm.setDefaultMaxPerRoute(10);
        // Build the client once instead of creating a new, never-closed client per request.
        this.httpClient = HttpClients.custom().setConnectionManager(this.cm).build();
    }

    /**
     * Fetches {@code url} with a GET request and returns the response body decoded as UTF-8.
     *
     * @param url address to request
     * @return the response body, or an empty string when the status is not
     *         {@code HttpStatus.SUCCESS}, the response has no entity, or an
     *         {@link IOException} occurs
     */
    public String doGetHtml(String url) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("User-Agent", USER_AGENT);
        httpGet.setConfig(this.getConfig());

        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            if (response.getStatusLine().getStatusCode() == HttpStatus.SUCCESS
                    && response.getEntity() != null) {
                return EntityUtils.toString(response.getEntity(), "utf-8");
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeResponse(response);
        }
        return "";
    }

    /**
     * Fetches {@code url} with a GET request and writes the response body to a file whose name
     * is a random UUID followed by the text after the last {@code '.'} of the URL.
     *
     * @param url address of the image to download
     * @return the generated file name, or an empty string when the status is not
     *         {@code HttpStatus.SUCCESS}, the response has no entity, or an
     *         {@link IOException} occurs
     */
    public String doGetImage(String url) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(this.getConfig());

        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            if (response.getStatusLine().getStatusCode() == HttpStatus.SUCCESS
                    && response.getEntity() != null) {
                // A URL without '.' yields no extension instead of a
                // StringIndexOutOfBoundsException from substring(-1).
                int dotIndex = url.lastIndexOf(".");
                String extName = dotIndex >= 0 ? url.substring(dotIndex) : "";
                String picName = UUID.randomUUID().toString() + extName;
                // try-with-resources closes the file handle even when the transfer fails.
                try (OutputStream outputStream = new FileOutputStream(IMAGE_PATH_PREFIX + picName)) {
                    response.getEntity().writeTo(outputStream);
                }
                return picName;
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeResponse(response);
        }
        return "";
    }

    /**
     * Closes {@code response} if it is not {@code null}; a failure to close is printed and
     * otherwise ignored so that it cannot replace the result of the request.
     */
    private void closeResponse(CloseableHttpResponse response) {
        if (response == null) {
            return;
        }
        try {
            response.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Builds the per-request timeouts.
     *
     * @return RequestConfig with a 2000 ms connect timeout, a 1000 ms timeout for obtaining a
     *         connection from the pool and a 10000 ms socket timeout
     */
    private RequestConfig getConfig() {
        return RequestConfig.custom()
                .setConnectTimeout(2000) // maximum time to establish the connection
                .setConnectionRequestTimeout(1000) // maximum time to obtain a pooled connection
                .setSocketTimeout(10000) // maximum time to wait for data
                .build();
    }
}

@ -1,30 +0,0 @@
package com.xjs.handler;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * WebMagic page processor whose {@link #process(Page)} is still empty.
 *
 * @author xiejs
 * @since 2022-01-24
 */
public class MiHoYoRepoPageProcessor implements PageProcessor {

    /** Crawl settings: 3 retries and a sleep time of 100 between requests. */
    private Site crawlSite = newSite();

    /** Builds the settings returned by {@link #getSite()}. */
    private static Site newSite() {
        Site configured = Site.me();
        configured.setRetryTimes(3);
        configured.setSleepTime(100);
        return configured;
    }

    /** Does nothing; no extraction logic has been written yet. */
    @Override
    public void process(Page page) {
    }

    /** @return the crawl settings of this processor */
    @Override
    public Site getSite() {
        return this.crawlSite;
    }
}

@ -0,0 +1,42 @@
package com.xjs.sina.controller;
import com.ruoyi.common.core.domain.R;
import com.xjs.sina.task.SinaTask;
import io.swagger.annotations.Api;
import io.swagger.annotations.ApiOperation;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
/**
 * REST controller exposing the Sina news crawler under the {@code sina} path.
 *
 * @author xiejs
 * @since 2022-02-15
 */
@RestController
@RequestMapping("sina")
@Api(tags = "爬虫模块-新浪新闻")
public class SinaNewsController {

    @Autowired
    private SinaTask sinaTask;

    //---------------------- remote RPC calls ---------------------------

    /**
     * Runs the Sina news crawl in the calling thread and then reports success.
     *
     * @return the result of {@code R.ok()}
     */
    @GetMapping("taskForPRC")
    @ApiOperation("供定时任务服务RPC远程调用")
    public R sinaTaskForPRC() {
        this.sinaTask.reptileSinaNews();
        final R success = R.ok();
        return success;
    }
}

@ -0,0 +1,17 @@
package com.xjs.sina.mapper;
import com.baomidou.mybatisplus.core.mapper.BaseMapper;
import com.xjs.sina.pojo.SinaNews;
/**
 * MyBatis-Plus mapper for {@link SinaNews} rows.
 *
 * @author xiejs
 * @since 2022-02-15
 */
public interface SinaNewsMapper extends BaseMapper<SinaNews> {
/**
 * Runs the {@code deleteRepeatData} statement of the mapper XML: for every title that occurs
 * more than once, deletes all rows except the one with the smallest id.
 *
 * @return int the count reported for the delete statement (presumably the number of rows
 *         removed — confirm against the MyBatis configuration)
 */
int deleteRepeatData();
}

@ -0,0 +1,39 @@
package com.xjs.sina.pojo;
import com.baomidou.mybatisplus.annotation.TableId;
import com.baomidou.mybatisplus.annotation.TableName;
import lombok.Data;
import java.io.Serializable;
import java.util.Date;
/**
 * Row of the {@code webmagic_sina_news} table: one crawled Sina news article.
 *
 * @author xiejs
 * @since 2022-02-15
 */
@TableName("webmagic_sina_news")
@Data
public class SinaNews implements Serializable {
private static final long serialVersionUID = 1L;
/** Primary key. */
@TableId
private Long id;
/**
 * Article title; the crawler fills it with the text of the article page's title element.
 */
private String title;
/**
 * Category; the crawler fills it with the text of the navigation link the article was found under.
 */
private String category;
/**
 * Article URL; the crawler fills it with the href the article page was fetched from.
 */
private String url;
/** Creation time; the crawler sets it to the moment the record object is built. */
private Date createTime;
}

@ -0,0 +1,17 @@
package com.xjs.sina.service;
import com.baomidou.mybatisplus.extension.service.IService;
import com.xjs.sina.pojo.SinaNews;
/**
 * Service for {@link SinaNews} records, adding duplicate clean-up to the MyBatis-Plus
 * {@link IService} operations.
 *
 * @author xiejs
 * @since 2022-02-15
 */
public interface SinaNewsService extends IService<SinaNews> {
/**
 * Removes duplicated news rows.
 *
 * @return int the count reported by the underlying delete (presumably the number of rows
 *         removed — confirm against the mapper)
 */
int deleteRepeatData();
}

@ -0,0 +1,25 @@
package com.xjs.sina.service.impl;
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
import com.xjs.sina.mapper.SinaNewsMapper;
import com.xjs.sina.pojo.SinaNews;
import com.xjs.sina.service.SinaNewsService;
import org.springframework.stereotype.Service;
import javax.annotation.Resource;
/**
 * Default {@link SinaNewsService} implementation backed by {@link SinaNewsMapper}.
 *
 * @author xiejs
 * @since 2022-02-15
 */
@Service
public class SinaNewsServiceImpl extends ServiceImpl<SinaNewsMapper, SinaNews> implements SinaNewsService {

    @Resource
    private SinaNewsMapper sinaNewsMapper;

    /**
     * Delegates to {@link SinaNewsMapper#deleteRepeatData()}.
     *
     * @return the value returned by the mapper
     */
    @Override
    public int deleteRepeatData() {
        final int removedRows = this.sinaNewsMapper.deleteRepeatData();
        return removedRows;
    }
}

@ -0,0 +1,176 @@
package com.xjs.sina.task;
import cn.hutool.core.collection.CollUtil;
import com.ruoyi.common.core.utils.StringUtils;
import com.xjs.common.util.HttpUtils;
import com.xjs.sina.pojo.SinaNews;
import com.xjs.sina.service.SinaNewsService;
import lombok.extern.log4j.Log4j2;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import java.util.*;
import java.util.stream.Collectors;
/**
*
* @author xiejs
* @since 2022-02-15
*/
@Component
@Log4j2
public class SinaTask {
@Autowired
private HttpUtils httpUtils;
@Autowired
private SinaNewsService sinaNewsService;
public void reptileSinaNews() {
try {
String url = "https://news.sina.com.cn/";
String html = httpUtils.doGetHtml(url);
Document document = Jsoup.parse(html);
this.parse(document);
} catch (Exception e) {
e.printStackTrace();
}
}
/**
* dom
*
* @param document dom
*/
private void parse(Document document) {
try {
//获取子链接
Elements nav_mod_1 = document.getElementsByClass("nav-mod-1");
Elements link = nav_mod_1.select("ul > li > a");
List<Map<String, String>> hrefList = link.stream().map(a -> {
String href = a.attr("href");
String text = a.text();
Map<String, String> map = new HashMap<>();
map.put(text, href);
return map;
}).collect(Collectors.toList());
hrefList.removeIf(s -> s.containsKey("javascript:;"));
for (Map<String, String> map : hrefList) {
Set<Map.Entry<String, String>> entrySet = map.entrySet();
for (Map.Entry<String, String> entry : entrySet) {
String html = httpUtils.doGetHtml(entry.getValue());
Document docChild = Jsoup.parse(html);
this.parseChile(docChild, entry.getKey());
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
/**
* dom
*
* @param docChild
* @param key key
*/
private void parseChile(Document docChild, String key) {
try {
Elements a = docChild.getElementsByTag("a");
ArrayList<String> link = new ArrayList<>();
for (Element element : a) {
String href = element.attr("href");
if (href.contains(".html") || href.contains(".shtml")) {
link.add(href);
}
}
ArrayList<SinaNews> sinaNewsList = new ArrayList<>();
//遍历每个文章页面,然后持久化到数据库
for (String url : link) {
//url不包含yyyy-dd- 直接跳过
if (!url.contains("-")) {
continue;
}
String html = httpUtils.doGetHtml(url);
Document document = Jsoup.parse(html);
Elements main_title = document.getElementsByClass("main-title");
Elements tit = document.getElementsByClass("tit");
Element artibodyTitle = document.getElementById("artibodyTitle");
Elements F_yahei = document.getElementsByClass("F-yahei");
Elements crt_h1 = document.select(".crticalcontent > h1");
Elements crth_h1 = document.select(".article-header > h1");
if (CollUtil.isNotEmpty(main_title)
|| CollUtil.isNotEmpty(tit)
|| artibodyTitle != null
|| CollUtil.isNotEmpty(F_yahei)
|| CollUtil.isNotEmpty(crt_h1)
|| CollUtil.isNotEmpty(crth_h1)) {
String title = null;
if (CollUtil.isNotEmpty(main_title)) {
title = main_title.text();
}
if (title == null) {
if (CollUtil.isNotEmpty(tit)) {
title = tit.text();
}
}
if (title == null) {
if (artibodyTitle != null) {
title = artibodyTitle.text();
}
}
if (title == null) {
if (CollUtil.isNotEmpty(F_yahei)) {
title = F_yahei.text();
}
}
if (title == null) {
if (CollUtil.isNotEmpty(crt_h1)) {
title = crt_h1.text();
}
}
if (title == null) {
if (CollUtil.isNotEmpty(crth_h1)) {
title = crth_h1.text();
}
}
if (StringUtils.isEmpty(title)) {
continue;
}
//持久化
SinaNews sinaNews = new SinaNews();
sinaNews.setCategory(key);
sinaNews.setTitle(title);
sinaNews.setUrl(url);
sinaNews.setCreateTime(new Date());
sinaNewsList.add(sinaNews);
}
}
sinaNewsService.saveBatch(sinaNewsList, 30);
//删除重复
int count = sinaNewsService.deleteRepeatData();
log.info("重复数据为:{}", count);
} catch (Exception e) {
e.printStackTrace();
}
}
}

@ -0,0 +1,21 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper
PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
"http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.xjs.sina.mapper.SinaNewsMapper">
<!--
  Deletes duplicated news rows: for every title that occurs more than once, all rows are
  removed except the one with the smallest id. Both sub-selects are wrapped in a derived
  table (t / c) so that the table being deleted from is not selected from directly.
-->
<delete id="deleteRepeatData">
DELETE
FROM
webmagic_sina_news
WHERE
title IN ( SELECT t.title FROM ( SELECT title FROM webmagic_sina_news GROUP BY title HAVING count( title ) > 1 ) t )
AND id NOT IN (
SELECT
c.id
FROM
( SELECT min( id ) id FROM webmagic_sina_news GROUP BY title HAVING count( title )> 1 ) c
)
</delete>
</mapper>
Loading…
Cancel
Save