From 1b0625318ca868a2afc6a758725be0d2bf565d2e Mon Sep 17 00:00:00 2001 From: xjs <1294405880@qq.com> Date: Tue, 15 Feb 2022 20:18:52 +0800 Subject: [PATCH] =?UTF-8?q?1=E3=80=81=E7=88=AC=E8=99=AB=E6=9C=8D=E5=8A=A1?= =?UTF-8?q?=E5=AE=9E=E7=8E=B0=E6=96=B0=E6=B5=AA=E6=96=B0=E9=97=BB=E7=88=AC?= =?UTF-8?q?=E8=99=AB=E5=8A=9F=E8=83=BD=202=E3=80=81=E8=BF=9C=E7=A8=8B?= =?UTF-8?q?=E5=AE=9A=E6=97=B6=E4=BB=BB=E5=8A=A1=E8=B0=83=E7=94=A8=E6=96=B0?= =?UTF-8?q?=E6=B5=AA=E7=88=AC=E8=99=AB=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../warning/RemoteWarningCRUDFeign.java | 4 +- .../webmagic/RemoteWebmagicSinaFeign.java | 21 +++ .../factory/RemoteWebmagicSinaFactory.java | 28 +++ .../core/constant/ServiceNameConstants.java | 6 + .../com/xjs/job/task/webmagic/SinaTask.java | 39 ++++ .../java/com/xjs/common/util/HttpUtils.java | 151 +++++++++++++++ .../xjs/handler/MiHoYoRepoPageProcessor.java | 30 --- .../sina/controller/SinaNewsController.java | 42 +++++ .../com/xjs/sina/mapper/SinaNewsMapper.java | 17 ++ .../main/java/com/xjs/sina/pojo/SinaNews.java | 39 ++++ .../com/xjs/sina/service/SinaNewsService.java | 17 ++ .../service/impl/SinaNewsServiceImpl.java | 25 +++ .../main/java/com/xjs/sina/task/SinaTask.java | 176 ++++++++++++++++++ .../mapper/webmagic/SinaNewsMapper.xml | 21 +++ 14 files changed, 584 insertions(+), 32 deletions(-) create mode 100644 ruoyi-api/ruoyi-api-system/src/main/java/com/xjs/business/webmagic/RemoteWebmagicSinaFeign.java create mode 100644 ruoyi-api/ruoyi-api-system/src/main/java/com/xjs/business/webmagic/factory/RemoteWebmagicSinaFactory.java create mode 100644 ruoyi-modules/ruoyi-job/src/main/java/com/xjs/job/task/webmagic/SinaTask.java create mode 100644 xjs-business/xjs-business-webmagic/src/main/java/com/xjs/common/util/HttpUtils.java delete mode 100644 xjs-business/xjs-business-webmagic/src/main/java/com/xjs/handler/MiHoYoRepoPageProcessor.java create mode 100644 xjs-business/xjs-business-webmagic/src/main/java/com/xjs/sina/controller/SinaNewsController.java create mode 100644 xjs-business/xjs-business-webmagic/src/main/java/com/xjs/sina/mapper/SinaNewsMapper.java create mode 100644 xjs-business/xjs-business-webmagic/src/main/java/com/xjs/sina/pojo/SinaNews.java create mode 100644 xjs-business/xjs-business-webmagic/src/main/java/com/xjs/sina/service/SinaNewsService.java create mode 100644 xjs-business/xjs-business-webmagic/src/main/java/com/xjs/sina/service/impl/SinaNewsServiceImpl.java create mode 100644 xjs-business/xjs-business-webmagic/src/main/java/com/xjs/sina/task/SinaTask.java create mode 100644 xjs-business/xjs-business-webmagic/src/main/resources/mapper/webmagic/SinaNewsMapper.xml diff --git a/ruoyi-api/ruoyi-api-system/src/main/java/com/xjs/business/warning/RemoteWarningCRUDFeign.java b/ruoyi-api/ruoyi-api-system/src/main/java/com/xjs/business/warning/RemoteWarningCRUDFeign.java index f8c4cca6..eadd2d8e 100644 --- a/ruoyi-api/ruoyi-api-system/src/main/java/com/xjs/business/warning/RemoteWarningCRUDFeign.java +++ b/ruoyi-api/ruoyi-api-system/src/main/java/com/xjs/business/warning/RemoteWarningCRUDFeign.java @@ -16,9 +16,9 @@ import org.springframework.web.bind.annotation.RequestBody; import java.util.List; /** + * 远程rpc调用预警服务crud接口 * @author xiejs - * @desc 远程rpc调用预警服务crud接口 - * @create 2021-12-31 + * @since 2021-12-31 */ @FeignClient(contextId = "remoteWarningCRUDFeign", value = ServiceNameConstants.BUSINESS_WARNING_SERVICE, diff --git a/ruoyi-api/ruoyi-api-system/src/main/java/com/xjs/business/webmagic/RemoteWebmagicSinaFeign.java b/ruoyi-api/ruoyi-api-system/src/main/java/com/xjs/business/webmagic/RemoteWebmagicSinaFeign.java new file mode 100644 index 00000000..992711c3 --- /dev/null +++ b/ruoyi-api/ruoyi-api-system/src/main/java/com/xjs/business/webmagic/RemoteWebmagicSinaFeign.java @@ -0,0 +1,21 @@ +package com.xjs.business.webmagic; + +import com.ruoyi.common.core.constant.ServiceNameConstants; +import com.ruoyi.common.core.domain.R; +import com.xjs.business.webmagic.factory.RemoteWebmagicSinaFactory; +import org.springframework.cloud.openfeign.FeignClient; +import org.springframework.web.bind.annotation.GetMapping; + +/** + * 内部 调用 新浪 爬虫定时任务 + * @author xiejs + * @since 2022-02-15 + */ +@FeignClient(contextId = "remoteWebmagicSinaFeign", + value = ServiceNameConstants.BUSINESS_WEBMAGIC_SERVICE, + fallbackFactory = RemoteWebmagicSinaFactory.class) +public interface RemoteWebmagicSinaFeign { + + @GetMapping("/sina/taskForPRC") + R sinaTaskForPRC(); +} diff --git a/ruoyi-api/ruoyi-api-system/src/main/java/com/xjs/business/webmagic/factory/RemoteWebmagicSinaFactory.java b/ruoyi-api/ruoyi-api-system/src/main/java/com/xjs/business/webmagic/factory/RemoteWebmagicSinaFactory.java new file mode 100644 index 00000000..c598d223 --- /dev/null +++ b/ruoyi-api/ruoyi-api-system/src/main/java/com/xjs/business/webmagic/factory/RemoteWebmagicSinaFactory.java @@ -0,0 +1,28 @@ +package com.xjs.business.webmagic.factory; + +import com.ruoyi.common.core.domain.R; +import com.xjs.business.webmagic.RemoteWebmagicSinaFeign; +import lombok.extern.log4j.Log4j2; +import org.springframework.cloud.openfeign.FallbackFactory; +import org.springframework.stereotype.Component; + +/** + * 内部 调用 新浪 爬虫定时任务 降级 + * @author xiejs + * @since 2022-02-15 + */ +@Component +@Log4j2 +public class RemoteWebmagicSinaFactory implements FallbackFactory { + @Override + public RemoteWebmagicSinaFeign create(Throwable cause) { + + return new RemoteWebmagicSinaFeign() { + @Override + public R sinaTaskForPRC() { + log.error("新浪 爬虫定时任务 降级"); + return R.fail("降级处理"); + } + }; + } +} diff --git a/ruoyi-common/ruoyi-common-core/src/main/java/com/ruoyi/common/core/constant/ServiceNameConstants.java b/ruoyi-common/ruoyi-common-core/src/main/java/com/ruoyi/common/core/constant/ServiceNameConstants.java index f3735eed..97c0acf3 100644 --- a/ruoyi-common/ruoyi-common-core/src/main/java/com/ruoyi/common/core/constant/ServiceNameConstants.java +++ b/ruoyi-common/ruoyi-common-core/src/main/java/com/ruoyi/common/core/constant/ServiceNameConstants.java @@ -38,6 +38,12 @@ public class ServiceNameConstants */ public static final String BUSINESS_LOG_SERVICE= "xjs-log" ; + /** + * 爬虫服务的serviceid + */ + public static final String BUSINESS_WEBMAGIC_SERVICE= "xjs-webmagic" ; + + diff --git a/ruoyi-modules/ruoyi-job/src/main/java/com/xjs/job/task/webmagic/SinaTask.java b/ruoyi-modules/ruoyi-job/src/main/java/com/xjs/job/task/webmagic/SinaTask.java new file mode 100644 index 00000000..4f831407 --- /dev/null +++ b/ruoyi-modules/ruoyi-job/src/main/java/com/xjs/job/task/webmagic/SinaTask.java @@ -0,0 +1,39 @@ +package com.xjs.job.task.webmagic; + +import cn.hutool.core.date.DateUtil; +import com.ruoyi.common.core.domain.R; +import com.xjs.business.webmagic.RemoteWebmagicSinaFeign; +import lombok.extern.log4j.Log4j2; +import org.springframework.stereotype.Component; + +import javax.annotation.Resource; +import java.time.LocalDateTime; +import java.time.temporal.ChronoUnit; + +/** + * 爬虫 新浪新闻 定时任务 + * @author xiejs + * @since 2022-02-15 + */ +@Component("SinaTask") +@Log4j2 +public class SinaTask { + @Resource + private RemoteWebmagicSinaFeign remoteWebmagicSinaFeign; + + /** + * 任务执行 + */ + public void sinaNews() { + log.info("---------------爬虫-新浪新闻定时任务Start-------------------"); + LocalDateTime localDateTime1 = DateUtil.date().toLocalDateTime(); + + R r = remoteWebmagicSinaFeign.sinaTaskForPRC(); + + log.info("爬虫-新浪新闻定时任务结果:code={},msg={},data={}",r.getCode(),r.getMsg(),r.getData()); + LocalDateTime localDateTime2 = DateUtil.date().toLocalDateTime(); + long between = ChronoUnit.MILLIS.between(localDateTime1, localDateTime2); + log.info("爬虫-新浪新闻定时任务Job耗费时间:{}ms", between); + log.info("---------------爬虫-新浪新闻定时任务end---------------------"); + } +} diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/common/util/HttpUtils.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/common/util/HttpUtils.java new file mode 100644 index 00000000..bf454a00 --- /dev/null +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/common/util/HttpUtils.java @@ -0,0 +1,151 @@ +package com.xjs.common.util; + +import com.ruoyi.common.core.constant.HttpStatus; +import org.apache.http.client.config.RequestConfig; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClients; +import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; +import org.apache.http.util.EntityUtils; +import org.springframework.stereotype.Component; + +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.util.UUID; + +/** + * 爬虫工具 + * + * @author xiejs + * @since 2022-02-15 + */ +@Component +public class HttpUtils { + + private PoolingHttpClientConnectionManager cm; + + public HttpUtils() { + this.cm = new PoolingHttpClientConnectionManager(); + + //设置最大连接数 + this.cm.setMaxTotal(100); + //设置每个主机最大连接数 + this.cm.setDefaultMaxPerRoute(10); + } + + /** + * 根据请求地址下载页面数据 + * + * @param url 地址 + * @return 页面数据 + */ + public String doGetHtml(String url) { + //获取httpClient对象 + CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(this.cm).build(); + + //设置httpGet请求对象,设置url地址 + HttpGet httpGet = new HttpGet(url); + + httpGet.setHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0"); + + //设置请求信息 + httpGet.setConfig(this.getConfig()); + + + + //使用httpClient发起请求,获取响应 + CloseableHttpResponse response = null; + try { + response = httpClient.execute(httpGet); + //解析响应,获取结果 + if (response.getStatusLine().getStatusCode() == HttpStatus.SUCCESS) { + //判断响应体entity是否不为空,如果不为空就可以使用EntityUtils + if (response.getEntity() != null) { + return EntityUtils.toString(response.getEntity(),"utf-8"); + } + } + } catch (IOException e) { + e.printStackTrace(); + } finally { + //关闭response + if (response != null) { + try { + response.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + return ""; + } + + + + /** + * 根据请求地址下载图片 + * + * @param url 地址 + * @return 图片名称 + */ + public String doGetImage(String url) { + //获取httpClient对象 + CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(this.cm).build(); + + //设置httpGet请求对象,设置url地址 + HttpGet httpGet = new HttpGet(url); + + //设置请求信息 + httpGet.setConfig(this.getConfig()); + + //使用httpClient发起请求,获取响应 + CloseableHttpResponse response = null; + try { + response = httpClient.execute(httpGet); + //解析响应,获取结果 + if (response.getStatusLine().getStatusCode() == HttpStatus.SUCCESS) { + //判断响应体entity是否不为空,如果不为空就可以使用EntityUtils + if (response.getEntity() != null) { + //获取图片后缀 + String extName = url.substring(url.lastIndexOf(".")); + + //创建图片名,重命名图片 + String picName = UUID.randomUUID().toString()+extName; + + //下载图片 + OutputStream outputStream =new FileOutputStream("D:\\Dev\\WebCrawler\\jd\\image"+picName); + response.getEntity().writeTo(outputStream); + + //返回图片名称 + return picName; + } + } + } catch (IOException e) { + e.printStackTrace(); + } finally { + //关闭response + if (response != null) { + try { + response.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + return ""; + } + + + /** + * 设置请求配置 + * @return RequestConfig + */ + private RequestConfig getConfig() { + return RequestConfig.custom() + .setConnectTimeout(2000)//创建连接的最长时间 + .setConnectionRequestTimeout(1000)//获取连接的最长时间 + .setSocketTimeout(10000)//数据传输的最长时间 + .build(); + } +} diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/handler/MiHoYoRepoPageProcessor.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/handler/MiHoYoRepoPageProcessor.java deleted file mode 100644 index 4b7f4cd1..00000000 --- a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/handler/MiHoYoRepoPageProcessor.java +++ /dev/null @@ -1,30 +0,0 @@ -package com.xjs.handler; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.processor.PageProcessor; - -/** - * 米哈游社区爬取资源 - * @author xiejs - * @since 2022-01-24 - */ -public class MiHoYoRepoPageProcessor implements PageProcessor { - - - private Site site = Site.me().setRetryTimes(3).setSleepTime(100); - - - @Override - public void process(Page page) { - - } - - @Override - public Site getSite() { - return site; - } - - - -} diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/sina/controller/SinaNewsController.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/sina/controller/SinaNewsController.java new file mode 100644 index 00000000..39a968db --- /dev/null +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/sina/controller/SinaNewsController.java @@ -0,0 +1,42 @@ +package com.xjs.sina.controller; + +import com.ruoyi.common.core.domain.R; +import com.xjs.sina.task.SinaTask; +import io.swagger.annotations.Api; +import io.swagger.annotations.ApiOperation; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.web.bind.annotation.GetMapping; +import org.springframework.web.bind.annotation.RequestMapping; +import org.springframework.web.bind.annotation.RestController; + +/** + * 新浪爬虫数据controller + * @author xiejs + * @since 2022-02-15 + */ +@RestController +@RequestMapping("sina") +@Api(tags = "爬虫模块-新浪新闻") +public class SinaNewsController { + @Autowired + private SinaTask sinaTask; + + + + + + + + + + + + //----------------------远程rpc调用--------------------------- + @GetMapping("taskForPRC") + @ApiOperation("供定时任务服务RPC远程调用") + public R sinaTaskForPRC() { + sinaTask.reptileSinaNews(); + return R.ok(); + } + +} diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/sina/mapper/SinaNewsMapper.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/sina/mapper/SinaNewsMapper.java new file mode 100644 index 00000000..88a754d4 --- /dev/null +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/sina/mapper/SinaNewsMapper.java @@ -0,0 +1,17 @@ +package com.xjs.sina.mapper; + +import com.baomidou.mybatisplus.core.mapper.BaseMapper; +import com.xjs.sina.pojo.SinaNews; + +/** + * @author xiejs + * @since 2022-02-15 + */ +public interface SinaNewsMapper extends BaseMapper { + + /** + * 删除重复数据 + * @return int + */ + int deleteRepeatData(); +} diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/sina/pojo/SinaNews.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/sina/pojo/SinaNews.java new file mode 100644 index 00000000..a78ddad5 --- /dev/null +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/sina/pojo/SinaNews.java @@ -0,0 +1,39 @@ +package com.xjs.sina.pojo; + +import com.baomidou.mybatisplus.annotation.TableId; +import com.baomidou.mybatisplus.annotation.TableName; +import lombok.Data; + +import java.io.Serializable; +import java.util.Date; + +/** + * @author xiejs + * @since 2022-02-15 + */ +@TableName("webmagic_sina_news") +@Data +public class SinaNews implements Serializable { + + private static final long serialVersionUID = 1L; + + @TableId + private Long id; + + /** + * 新闻标题 + */ + private String title; + + /** + * 新闻分类 + */ + private String category; + + /** + * 新闻地址 + */ + private String url; + + private Date createTime; +} diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/sina/service/SinaNewsService.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/sina/service/SinaNewsService.java new file mode 100644 index 00000000..308f6a5d --- /dev/null +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/sina/service/SinaNewsService.java @@ -0,0 +1,17 @@ +package com.xjs.sina.service; + +import com.baomidou.mybatisplus.extension.service.IService; +import com.xjs.sina.pojo.SinaNews; + +/** + * 新浪新闻爬虫Service接口 + * @author xiejs + * @since 2022-02-15 + */ +public interface SinaNewsService extends IService { + /** + * 删除重复数据 + * @return int + */ + int deleteRepeatData(); +} diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/sina/service/impl/SinaNewsServiceImpl.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/sina/service/impl/SinaNewsServiceImpl.java new file mode 100644 index 00000000..739629c9 --- /dev/null +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/sina/service/impl/SinaNewsServiceImpl.java @@ -0,0 +1,25 @@ +package com.xjs.sina.service.impl; + +import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl; +import com.xjs.sina.mapper.SinaNewsMapper; +import com.xjs.sina.pojo.SinaNews; +import com.xjs.sina.service.SinaNewsService; +import org.springframework.stereotype.Service; + +import javax.annotation.Resource; + +/** + * 新浪新闻爬虫Service接口实现 + * @author xiejs + * @since 2022-02-15 + */ +@Service +public class SinaNewsServiceImpl extends ServiceImpl implements SinaNewsService { + @Resource + private SinaNewsMapper sinaNewsMapper; + + @Override + public int deleteRepeatData() { + return sinaNewsMapper.deleteRepeatData(); + } +} diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/sina/task/SinaTask.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/sina/task/SinaTask.java new file mode 100644 index 00000000..f7301bf4 --- /dev/null +++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/sina/task/SinaTask.java @@ -0,0 +1,176 @@ +package com.xjs.sina.task; + +import cn.hutool.core.collection.CollUtil; +import com.ruoyi.common.core.utils.StringUtils; +import com.xjs.common.util.HttpUtils; +import com.xjs.sina.pojo.SinaNews; +import com.xjs.sina.service.SinaNewsService; +import lombok.extern.log4j.Log4j2; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Component; + +import java.util.*; +import java.util.stream.Collectors; + +/** + * 新浪新闻爬虫任务 + * @author xiejs + * @since 2022-02-15 + */ +@Component +@Log4j2 +public class SinaTask { + + @Autowired + private HttpUtils httpUtils; + @Autowired + private SinaNewsService sinaNewsService; + + public void reptileSinaNews() { + try { + String url = "https://news.sina.com.cn/"; + + String html = httpUtils.doGetHtml(url); + + Document document = Jsoup.parse(html); + + this.parse(document); + } catch (Exception e) { + e.printStackTrace(); + } + } + + /** + * 解析dom + * + * @param document dom + */ + private void parse(Document document) { + try { + //获取子链接 + Elements nav_mod_1 = document.getElementsByClass("nav-mod-1"); + Elements link = nav_mod_1.select("ul > li > a"); + List> hrefList = link.stream().map(a -> { + String href = a.attr("href"); + String text = a.text(); + Map map = new HashMap<>(); + map.put(text, href); + return map; + }).collect(Collectors.toList()); + hrefList.removeIf(s -> s.containsKey("javascript:;")); + + for (Map map : hrefList) { + Set> entrySet = map.entrySet(); + for (Map.Entry entry : entrySet) { + String html = httpUtils.doGetHtml(entry.getValue()); + Document docChild = Jsoup.parse(html); + this.parseChile(docChild, entry.getKey()); + } + } + } catch (Exception e) { + e.printStackTrace(); + } + + } + + /** + * 解析子dom + * + * @param docChild 子 + * @param key key + */ + private void parseChile(Document docChild, String key) { + try { + Elements a = docChild.getElementsByTag("a"); + ArrayList link = new ArrayList<>(); + for (Element element : a) { + String href = element.attr("href"); + if (href.contains(".html") || href.contains(".shtml")) { + link.add(href); + } + } + + ArrayList sinaNewsList = new ArrayList<>(); + + //遍历每个文章页面,然后持久化到数据库 + for (String url : link) { + //url不包含yyyy-dd- 直接跳过 + if (!url.contains("-")) { + continue; + } + String html = httpUtils.doGetHtml(url); + Document document = Jsoup.parse(html); + Elements main_title = document.getElementsByClass("main-title"); + Elements tit = document.getElementsByClass("tit"); + Element artibodyTitle = document.getElementById("artibodyTitle"); + Elements F_yahei = document.getElementsByClass("F-yahei"); + Elements crt_h1 = document.select(".crticalcontent > h1"); + Elements crth_h1 = document.select(".article-header > h1"); + + + if (CollUtil.isNotEmpty(main_title) + || CollUtil.isNotEmpty(tit) + || artibodyTitle != null + || CollUtil.isNotEmpty(F_yahei) + || CollUtil.isNotEmpty(crt_h1) + || CollUtil.isNotEmpty(crth_h1)) { + String title = null; + if (CollUtil.isNotEmpty(main_title)) { + title = main_title.text(); + } + if (title == null) { + if (CollUtil.isNotEmpty(tit)) { + title = tit.text(); + } + } + if (title == null) { + if (artibodyTitle != null) { + title = artibodyTitle.text(); + } + } + if (title == null) { + if (CollUtil.isNotEmpty(F_yahei)) { + title = F_yahei.text(); + } + } + if (title == null) { + if (CollUtil.isNotEmpty(crt_h1)) { + title = crt_h1.text(); + } + } + if (title == null) { + if (CollUtil.isNotEmpty(crth_h1)) { + title = crth_h1.text(); + } + } + + if (StringUtils.isEmpty(title)) { + continue; + } + + //持久化 + SinaNews sinaNews = new SinaNews(); + sinaNews.setCategory(key); + sinaNews.setTitle(title); + sinaNews.setUrl(url); + sinaNews.setCreateTime(new Date()); + + sinaNewsList.add(sinaNews); + } + } + sinaNewsService.saveBatch(sinaNewsList, 30); + + //删除重复 + int count = sinaNewsService.deleteRepeatData(); + log.info("重复数据为:{}", count); + + } catch (Exception e) { + e.printStackTrace(); + } + } + +} diff --git a/xjs-business/xjs-business-webmagic/src/main/resources/mapper/webmagic/SinaNewsMapper.xml b/xjs-business/xjs-business-webmagic/src/main/resources/mapper/webmagic/SinaNewsMapper.xml new file mode 100644 index 00000000..d89e6871 --- /dev/null +++ b/xjs-business/xjs-business-webmagic/src/main/resources/mapper/webmagic/SinaNewsMapper.xml @@ -0,0 +1,21 @@ + + + + + + + DELETE + FROM + webmagic_sina_news + WHERE + title IN ( SELECT t.title FROM ( SELECT title FROM webmagic_sina_news GROUP BY title HAVING count( title ) > 1 ) t ) + AND id NOT IN ( + SELECT + c.id + FROM + ( SELECT min( id ) id FROM webmagic_sina_news GROUP BY title HAVING count( title )> 1 ) c + ) + + \ No newline at end of file