From fba7ae528eddc6b257efaf60dae319e720768b83 Mon Sep 17 00:00:00 2001 From: xjs <1294405880@qq.com> Date: Wed, 16 Feb 2022 22:24:19 +0800 Subject: [PATCH] =?UTF-8?q?1=E3=80=81=E4=BC=98=E5=8C=96=E5=88=A0=E9=99=A4?= =?UTF-8?q?=E6=95=B0=E6=8D=AE=E5=BA=93=E9=87=8D=E5=A4=8D=E6=95=B0=E6=8D=AE?= =?UTF-8?q?sql=E9=A2=84=E8=AD=A6=EF=BC=8C=E6=80=A7=E8=83=BD=E6=8F=90?= =?UTF-8?q?=E5=8D=87n=E5=80=8D=202=E3=80=81=E6=96=B0=E5=A2=9E=E7=88=AC?= =?UTF-8?q?=E8=99=AB=E7=88=AC=E8=99=AB=E6=96=87=E6=A1=88=E7=BD=91=E6=95=B0?= =?UTF-8?q?=E6=8D=AE=E5=B9=B6=E6=8C=81=E4=B9=85=E5=8C=96=E5=88=B0=E6=95=B0?= =?UTF-8?q?=E6=8D=AE=E5=BA=93?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../com/xjs/job/task/CopyWritingTask.java | 2 +- .../openapi/ApiTopsearchAllnetworkMapper.xml | 11 +- .../openapi/ApiTopsearchBaiduMapper.xml | 11 +- .../openapi/ApiTopsearchDouyinMapper.xml | 11 +- .../openapi/ApiTopsearchWechatMapper.xml | 11 +- .../openapi/ApiTopsearchWeiboMapper.xml | 12 +- .../mapper/openapi/CopyWritingMapper.xml | 11 +- .../mapper/CopyWritingNetworkMapper.java | 18 +++ .../pojo/CopyWritingNetwork.java | 35 +++++ .../service/CopyWritingNetworkService.java | 17 +++ .../impl/CopyWritingNetworkServiceImpl.java | 25 ++++ .../task/CopyWritingNetworkTask.java | 124 ++++++++++++++++++ .../webmagic/CopyWritingNetworkMapper.xml | 15 +++ .../mapper/webmagic/SinaNewsMapper.xml | 11 +- 14 files changed, 299 insertions(+), 15 deletions(-) create mode 100644 xjs-business/xjs-business-webmagic/src/main/java/com/xjs/copywritingNetwork/mapper/CopyWritingNetworkMapper.java create mode 100644 xjs-business/xjs-business-webmagic/src/main/java/com/xjs/copywritingNetwork/pojo/CopyWritingNetwork.java create mode 100644 xjs-business/xjs-business-webmagic/src/main/java/com/xjs/copywritingNetwork/service/CopyWritingNetworkService.java create mode 100644 
xjs-business/xjs-business-webmagic/src/main/java/com/xjs/copywritingNetwork/service/impl/CopyWritingNetworkServiceImpl.java create mode 100644 xjs-business/xjs-business-webmagic/src/main/java/com/xjs/copywritingNetwork/task/CopyWritingNetworkTask.java create mode 100644 xjs-business/xjs-business-webmagic/src/main/resources/mapper/webmagic/CopyWritingNetworkMapper.xml diff --git a/ruoyi-modules/ruoyi-job/src/main/java/com/xjs/job/task/CopyWritingTask.java b/ruoyi-modules/ruoyi-job/src/main/java/com/xjs/job/task/CopyWritingTask.java index 1165b3e7..0fda68a3 100644 --- a/ruoyi-modules/ruoyi-job/src/main/java/com/xjs/job/task/CopyWritingTask.java +++ b/ruoyi-modules/ruoyi-job/src/main/java/com/xjs/job/task/CopyWritingTask.java @@ -30,7 +30,7 @@ public class CopyWritingTask { */ public void execute() { log.info("---------------文案定时任务Start-------------------"); - for (int i = 0; i < 3; i++) { + for (int i = 0; i < 6; i++) { LocalDateTime localDateTime1 = DateUtil.date().toLocalDateTime(); R r = remoteCopyWritingFeign.copyWriting(); log.info("文案定时任务[{}]结果:code={},msg={},data={}",i,r.getCode(),r.getMsg(),r.getData()); diff --git a/xjs-business/xjs-business-openapi/src/main/resources/mapper/openapi/ApiTopsearchAllnetworkMapper.xml b/xjs-business/xjs-business-openapi/src/main/resources/mapper/openapi/ApiTopsearchAllnetworkMapper.xml index 0295589b..337d5ec6 100644 --- a/xjs-business/xjs-business-openapi/src/main/resources/mapper/openapi/ApiTopsearchAllnetworkMapper.xml +++ b/xjs-business/xjs-business-openapi/src/main/resources/mapper/openapi/ApiTopsearchAllnetworkMapper.xml @@ -4,7 +4,7 @@ "http://mybatis.org/dtd/mybatis-3-mapper.dtd"> - + DELETE FROM api_topsearch_allnetwork @@ -18,4 +18,13 @@ ) + + delete from api_topsearch_allnetwork where id not in ( + SELECT + t.min_id + FROM + ( SELECT min( id ) AS min_id FROM api_topsearch_allnetwork GROUP BY title ) AS t + ) + + \ No newline at end of file diff --git 
a/xjs-business/xjs-business-openapi/src/main/resources/mapper/openapi/ApiTopsearchBaiduMapper.xml b/xjs-business/xjs-business-openapi/src/main/resources/mapper/openapi/ApiTopsearchBaiduMapper.xml index b3af21ac..976edadf 100644 --- a/xjs-business/xjs-business-openapi/src/main/resources/mapper/openapi/ApiTopsearchBaiduMapper.xml +++ b/xjs-business/xjs-business-openapi/src/main/resources/mapper/openapi/ApiTopsearchBaiduMapper.xml @@ -4,7 +4,7 @@ "http://mybatis.org/dtd/mybatis-3-mapper.dtd"> - + DELETE FROM api_topsearch_baidu @@ -18,4 +18,13 @@ ) + + delete from api_topsearch_baidu where id not in ( + SELECT + t.min_id + FROM + ( SELECT min( id ) AS min_id FROM api_topsearch_baidu GROUP BY title ) AS t + ) + + \ No newline at end of file diff --git a/xjs-business/xjs-business-openapi/src/main/resources/mapper/openapi/ApiTopsearchDouyinMapper.xml b/xjs-business/xjs-business-openapi/src/main/resources/mapper/openapi/ApiTopsearchDouyinMapper.xml index 852d02cd..2a5633f4 100644 --- a/xjs-business/xjs-business-openapi/src/main/resources/mapper/openapi/ApiTopsearchDouyinMapper.xml +++ b/xjs-business/xjs-business-openapi/src/main/resources/mapper/openapi/ApiTopsearchDouyinMapper.xml @@ -4,7 +4,7 @@ "http://mybatis.org/dtd/mybatis-3-mapper.dtd"> - + DELETE FROM api_topsearch_douyin @@ -18,4 +18,13 @@ ) + + delete from api_topsearch_douyin where id not in ( + SELECT + t.min_id + FROM + ( SELECT min( id ) AS min_id FROM api_topsearch_douyin GROUP BY word ) AS t + ) + + \ No newline at end of file diff --git a/xjs-business/xjs-business-openapi/src/main/resources/mapper/openapi/ApiTopsearchWechatMapper.xml b/xjs-business/xjs-business-openapi/src/main/resources/mapper/openapi/ApiTopsearchWechatMapper.xml index db640fee..a1d14c41 100644 --- a/xjs-business/xjs-business-openapi/src/main/resources/mapper/openapi/ApiTopsearchWechatMapper.xml +++ b/xjs-business/xjs-business-openapi/src/main/resources/mapper/openapi/ApiTopsearchWechatMapper.xml @@ -4,7 +4,7 @@ 
"http://mybatis.org/dtd/mybatis-3-mapper.dtd"> - + DELETE FROM api_topsearch_wechat @@ -18,4 +18,13 @@ ) + + delete from api_topsearch_wechat where id not in ( + SELECT + t.min_id + FROM + ( SELECT min( id ) AS min_id FROM api_topsearch_wechat GROUP BY word ) AS t + ) + + \ No newline at end of file diff --git a/xjs-business/xjs-business-openapi/src/main/resources/mapper/openapi/ApiTopsearchWeiboMapper.xml b/xjs-business/xjs-business-openapi/src/main/resources/mapper/openapi/ApiTopsearchWeiboMapper.xml index 49827bb6..0777ba74 100644 --- a/xjs-business/xjs-business-openapi/src/main/resources/mapper/openapi/ApiTopsearchWeiboMapper.xml +++ b/xjs-business/xjs-business-openapi/src/main/resources/mapper/openapi/ApiTopsearchWeiboMapper.xml @@ -4,7 +4,7 @@ "http://mybatis.org/dtd/mybatis-3-mapper.dtd"> - + DELETE FROM api_topsearch_weibo @@ -18,4 +18,14 @@ ) + + delete from api_topsearch_weibo where id not in ( + SELECT + t.min_id + FROM + ( SELECT min( id ) AS min_id FROM api_topsearch_weibo GROUP BY hotword ) AS t + ) + + + \ No newline at end of file diff --git a/xjs-business/xjs-business-openapi/src/main/resources/mapper/openapi/CopyWritingMapper.xml b/xjs-business/xjs-business-openapi/src/main/resources/mapper/openapi/CopyWritingMapper.xml index 6049ce20..dc9ed57a 100644 --- a/xjs-business/xjs-business-openapi/src/main/resources/mapper/openapi/CopyWritingMapper.xml +++ b/xjs-business/xjs-business-openapi/src/main/resources/mapper/openapi/CopyWritingMapper.xml @@ -66,7 +66,7 @@ - + /*弃用--效率不行*/ DELETE FROM api_copywriting @@ -75,5 +75,14 @@ AND id NOT IN ( SELECT c.id FROM ( SELECT min( id ) id FROM api_copywriting GROUP BY content HAVING count( content )> 1 ) c ) + + delete from api_copywriting where id not in ( + SELECT + t.min_id + FROM + ( SELECT min( id ) AS min_id FROM api_copywriting GROUP BY content ) AS t + ) + + \ No newline at end of file diff --git 
a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/copywritingNetwork/mapper/CopyWritingNetworkMapper.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/copywritingNetwork/mapper/CopyWritingNetworkMapper.java
new file mode 100644
index 00000000..a7df6688
--- /dev/null
+++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/copywritingNetwork/mapper/CopyWritingNetworkMapper.java
@@ -0,0 +1,20 @@
+package com.xjs.copywritingNetwork.mapper;
+
+import com.baomidou.mybatisplus.core.mapper.BaseMapper;
+import com.xjs.copywritingNetwork.pojo.CopyWritingNetwork;
+
+/**
+ * MyBatis-Plus mapper for {@link CopyWritingNetwork} records.
+ *
+ * @author xiejs
+ * @since 2022-02-16
+ */
+public interface CopyWritingNetworkMapper extends BaseMapper<CopyWritingNetwork> {
+
+    /**
+     * Deletes duplicate rows.
+     *
+     * @return the value returned by the mapped delete statement
+     */
+    int deleteRepeatData();
+}
diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/copywritingNetwork/pojo/CopyWritingNetwork.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/copywritingNetwork/pojo/CopyWritingNetwork.java
new file mode 100644
index 00000000..be30f688
--- /dev/null
+++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/copywritingNetwork/pojo/CopyWritingNetwork.java
@@ -0,0 +1,37 @@
+package com.xjs.copywritingNetwork.pojo;
+
+import com.baomidou.mybatisplus.annotation.FieldFill;
+import com.baomidou.mybatisplus.annotation.TableField;
+import com.baomidou.mybatisplus.annotation.TableName;
+import lombok.Data;
+
+import java.io.Serializable;
+import java.util.Date;
+
+/**
+ * Entity mapped to the {@code webmagic_copywriting_network} table.
+ *
+ * @author xiejs
+ * @since 2022-02-16
+ */
+@Data
+@TableName("webmagic_copywriting_network")
+public class CopyWritingNetwork implements Serializable {
+    private static final long serialVersionUID = 1L;
+
+    /** Row identifier. */
+    private Long id;
+
+    /** Copywriting tag. */
+    private String type;
+
+    /** Copywriting theme. */
+    private String theme;
+
+    /** Copywriting text. */
+    private String content;
+
+    /** Creation time; declared with the INSERT fill strategy. */
+    @TableField(fill = FieldFill.INSERT)
+    private Date createTime;
+}
diff --git 
a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/copywritingNetwork/service/CopyWritingNetworkService.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/copywritingNetwork/service/CopyWritingNetworkService.java
new file mode 100644
index 00000000..14fa2fb0
--- /dev/null
+++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/copywritingNetwork/service/CopyWritingNetworkService.java
@@ -0,0 +1,19 @@
+package com.xjs.copywritingNetwork.service;
+
+import com.baomidou.mybatisplus.extension.service.IService;
+import com.xjs.copywritingNetwork.pojo.CopyWritingNetwork;
+
+/**
+ * Service interface for {@link CopyWritingNetwork} records.
+ *
+ * @author xiejs
+ * @since 2022-02-16
+ */
+public interface CopyWritingNetworkService extends IService<CopyWritingNetwork> {
+    /**
+     * Deletes duplicate rows.
+     *
+     * @return the value returned by the mapper's delete statement
+     */
+    int deleteRepeatData();
+}
diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/copywritingNetwork/service/impl/CopyWritingNetworkServiceImpl.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/copywritingNetwork/service/impl/CopyWritingNetworkServiceImpl.java
new file mode 100644
index 00000000..826ec40b
--- /dev/null
+++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/copywritingNetwork/service/impl/CopyWritingNetworkServiceImpl.java
@@ -0,0 +1,29 @@
+package com.xjs.copywritingNetwork.service.impl;
+
+import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
+import com.xjs.copywritingNetwork.mapper.CopyWritingNetworkMapper;
+import com.xjs.copywritingNetwork.pojo.CopyWritingNetwork;
+import com.xjs.copywritingNetwork.service.CopyWritingNetworkService;
+import org.springframework.stereotype.Service;
+
+import javax.annotation.Resource;
+
+/**
+ * {@link CopyWritingNetworkService} implementation that delegates de-duplication to the mapper.
+ *
+ * @author xiejs
+ * @since 2022-02-16
+ */
+@Service
+public class CopyWritingNetworkServiceImpl
+        extends ServiceImpl<CopyWritingNetworkMapper, CopyWritingNetwork>
+        implements CopyWritingNetworkService {
+
+    @Resource
+    private CopyWritingNetworkMapper copyWritingNetworkMapper;
+
+    @Override
+    public int deleteRepeatData() {
+        return copyWritingNetworkMapper.deleteRepeatData();
+    }
+}
diff --git a/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/copywritingNetwork/task/CopyWritingNetworkTask.java b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/copywritingNetwork/task/CopyWritingNetworkTask.java
new file mode 100644
index 00000000..a5f60d02
--- /dev/null
+++ b/xjs-business/xjs-business-webmagic/src/main/java/com/xjs/copywritingNetwork/task/CopyWritingNetworkTask.java
@@ -0,0 +1,145 @@
+package com.xjs.copywritingNetwork.task;
+
+import com.xjs.common.util.HttpUtils;
+import com.xjs.copywritingNetwork.pojo.CopyWritingNetwork;
+import com.xjs.copywritingNetwork.service.CopyWritingNetworkService;
+import lombok.extern.log4j.Log4j2;
+import org.apache.commons.lang3.StringUtils;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.scheduling.annotation.Scheduled;
+import org.springframework.stereotype.Component;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+/**
+ * Scheduled crawler for the copywriting site at https://www.wenanwang.com/.
+ *
+ * Fetches the home page, follows every link found inside the {@code zyzt} elements, opens each
+ * article linked from those listing pages, and stores every paragraph that is neither empty nor
+ * purely numeric as a {@link CopyWritingNetwork} row. Duplicate rows are deleted after every run.
+ *
+ * @author xiejs
+ * @since 2022-02-16
+ */
+@Component
+@Log4j2
+public class CopyWritingNetworkTask {
+
+    /** Root URL of the crawled site; article hrefs are appended to it. */
+    public static final String URL = "https://www.wenanwang.com/";
+
+    /** Matches text made up solely of ASCII digits (and, because of {@code *}, the empty string). */
+    private static final Pattern DIGITS_ONLY = Pattern.compile("[0-9]*");
+
+    /** Batch size passed to {@code saveBatch}. */
+    private static final int BATCH_SIZE = 20;
+
+    @Autowired
+    private HttpUtils httpUtils;
+    @Autowired
+    private CopyWritingNetworkService copyWritingNetworkService;
+
+    /**
+     * Runs one crawl and then deletes duplicate rows, whether or not the crawl succeeded.
+     *
+     * NOTE(review): the next run starts 5 seconds after the previous one finishes, so the whole
+     * site is re-crawled almost continuously -- confirm that this interval is intended.
+     */
+    @Scheduled(fixedDelay = 1000 * 5)
+    public void reptileCopyWriting() {
+        try {
+            String html = httpUtils.doGetHtml(URL);
+            if (StringUtils.isBlank(html)) {
+                log.warn("Empty response for {}, skipping this crawl", URL);
+                return;
+            }
+            this.parseHtmlGetUrl(Jsoup.parse(html));
+        } catch (Exception e) {
+            // Keep the scheduled job alive, but put the failure and its stack trace in the log.
+            log.error("Failed to crawl {}", URL, e);
+        } finally {
+            int i = copyWritingNetworkService.deleteRepeatData();
+            log.info("删除文案网数据重复数:{}", i);
+        }
+    }
+
+    /**
+     * Collects the links inside the {@code zyzt} elements of the home page and crawls each one.
+     *
+     * @param document parsed home page
+     */
+    private void parseHtmlGetUrl(Document document) {
+        Map<String, String> categoryUrls = new HashMap<>();
+        for (Element block : document.getElementsByClass("zyzt")) {
+            for (Element link : block.select("a")) {
+                // Keyed by link text, so a repeated label keeps only the last href seen.
+                categoryUrls.put(link.text(), link.attr("href"));
+            }
+        }
+
+        this.parseHtmlGetCopyWriting(categoryUrls);
+    }
+
+    /**
+     * Downloads every listing page, then every article it links to, and persists the paragraphs.
+     *
+     * @param map link text (stored as the record type) mapped to the listing page URL
+     */
+    private void parseHtmlGetCopyWriting(Map<String, String> map) {
+        List<CopyWritingNetwork> copyWritingNetworks = new ArrayList<>();
+
+        for (Map.Entry<String, String> entry : map.entrySet()) {
+            String listHtml = httpUtils.doGetHtml(entry.getValue());
+            if (StringUtils.isBlank(listHtml)) {
+                // HttpUtils is not visible here; skip a page with no body instead of aborting the run.
+                continue;
+            }
+            for (Element link : Jsoup.parse(listHtml).select(".newslist li h5 a")) {
+                // NOTE(review): assumes article hrefs are relative to the site root -- confirm.
+                String articleHtml = httpUtils.doGetHtml(URL + link.attr("href"));
+                if (StringUtils.isBlank(articleHtml)) {
+                    continue;
+                }
+                Document article = Jsoup.parse(articleHtml);
+                copyWritingNetworks.addAll(this.extractCopyWriting(article, entry.getKey()));
+            }
+        }
+
+        copyWritingNetworkService.saveBatch(copyWritingNetworks, BATCH_SIZE);
+    }
+
+    /**
+     * Builds one record per usable paragraph of an article page.
+     *
+     * @param article parsed article page
+     * @param type    text of the home-page link the article was reached from
+     * @return records for the paragraphs that are neither empty nor purely numeric
+     */
+    private List<CopyWritingNetwork> extractCopyWriting(Document article, String type) {
+        // The last matching heading supplies the theme; it stays null when the page has none.
+        Elements headings = article.select(".newsview > h1");
+        String theme = headings.isEmpty() ? null : headings.last().text();
+
+        List<CopyWritingNetwork> rows = new ArrayList<>();
+        for (Element paragraph : article.select(".content > p")) {
+            String content = paragraph.text();
+            if (StringUtils.isEmpty(content) || DIGITS_ONLY.matcher(content).matches()) {
+                continue;
+            }
+            CopyWritingNetwork row = new CopyWritingNetwork();
+            row.setTheme(theme);
+            row.setContent(content);
+            row.setType(type);
+            rows.add(row);
+        }
+        return rows;
+    }
+}
diff --git a/xjs-business/xjs-business-webmagic/src/main/resources/mapper/webmagic/CopyWritingNetworkMapper.xml b/xjs-business/xjs-business-webmagic/src/main/resources/mapper/webmagic/CopyWritingNetworkMapper.xml
new file mode 100644
index 00000000..804bbe60
--- /dev/null
+++ 
b/xjs-business/xjs-business-webmagic/src/main/resources/mapper/webmagic/CopyWritingNetworkMapper.xml @@ -0,0 +1,15 @@ + + + + + + delete from webmagic_copywriting_network where id not in ( + SELECT + t.min_id + FROM + ( SELECT min( id ) AS min_id FROM webmagic_copywriting_network GROUP BY content ) AS t + ) + + \ No newline at end of file diff --git a/xjs-business/xjs-business-webmagic/src/main/resources/mapper/webmagic/SinaNewsMapper.xml b/xjs-business/xjs-business-webmagic/src/main/resources/mapper/webmagic/SinaNewsMapper.xml index ae269649..fdb3f12a 100644 --- a/xjs-business/xjs-business-webmagic/src/main/resources/mapper/webmagic/SinaNewsMapper.xml +++ b/xjs-business/xjs-business-webmagic/src/main/resources/mapper/webmagic/SinaNewsMapper.xml @@ -38,16 +38,11 @@ - DELETE - FROM - webmagic_sina_news - WHERE - title IN ( SELECT t.title FROM ( SELECT title FROM webmagic_sina_news GROUP BY title HAVING count( title ) > 1 ) t ) - AND id NOT IN ( + delete from webmagic_sina_news where id not in ( SELECT - c.id + t.min_id FROM - ( SELECT min( id ) id FROM webmagic_sina_news GROUP BY title HAVING count( title )> 1 ) c + ( SELECT min( id ) AS min_id FROM webmagic_sina_news GROUP BY title ) AS t ) \ No newline at end of file