1、优化删除数据库重复数据sql预警,性能提升n倍

2、新增爬虫,爬取文案网数据并持久化到数据库
pull/254/head
xjs 4 years ago
parent 6bc9887cc9
commit fba7ae528e

@ -30,7 +30,7 @@ public class CopyWritingTask {
*/
public void execute() {
log.info("---------------文案定时任务Start-------------------");
for (int i = 0; i < 3; i++) {
for (int i = 0; i < 6; i++) {
LocalDateTime localDateTime1 = DateUtil.date().toLocalDateTime();
R<CopyWriting> r = remoteCopyWritingFeign.copyWriting();
log.info("文案定时任务[{}]结果:code={},msg={},data={}",i,r.getCode(),r.getMsg(),r.getData());

@ -4,7 +4,7 @@
"http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.xjs.topsearch.mapper.ApiTopsearchAllnetworkMapper">
<delete id="deleteRepeatData">
<delete id="deleteRepeatData_copy">
DELETE
FROM
api_topsearch_allnetwork
@ -18,4 +18,13 @@
)
</delete>
<!-- Deletes duplicate rows from api_topsearch_allnetwork: rows are grouped by title and only the row with the smallest id in each group is kept.
     NOTE(review): the inner SELECT is wrapped in the derived table "t", presumably so that MySQL accepts a subquery on the table being deleted from; confirm the target database. -->
<delete id="deleteRepeatData">
delete from api_topsearch_allnetwork where id not in (
SELECT
t.min_id
FROM
( SELECT min( id ) AS min_id FROM api_topsearch_allnetwork GROUP BY title ) AS t
)
</delete>
</mapper>

@ -4,7 +4,7 @@
"http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.xjs.topsearch.mapper.ApiTopsearchBaiduMapper">
<delete id="deleteRepeatData">
<delete id="deleteRepeatData_copy">
DELETE
FROM
api_topsearch_baidu
@ -18,4 +18,13 @@
)
</delete>
<!-- Deletes duplicate rows from api_topsearch_baidu: rows are grouped by title and only the row with the smallest id in each group is kept.
     NOTE(review): the inner SELECT is wrapped in the derived table "t", presumably so that MySQL accepts a subquery on the table being deleted from; confirm the target database. -->
<delete id="deleteRepeatData">
delete from api_topsearch_baidu where id not in (
SELECT
t.min_id
FROM
( SELECT min( id ) AS min_id FROM api_topsearch_baidu GROUP BY title ) AS t
)
</delete>
</mapper>

@ -4,7 +4,7 @@
"http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.xjs.topsearch.mapper.ApiTopsearchDouyinMapper">
<delete id="deleteRepeatData">
<delete id="deleteRepeatData_copy">
DELETE
FROM
api_topsearch_douyin
@ -18,4 +18,13 @@
)
</delete>
<!-- Deletes duplicate rows from api_topsearch_douyin: rows are grouped by word and only the row with the smallest id in each group is kept.
     NOTE(review): the inner SELECT is wrapped in the derived table "t", presumably so that MySQL accepts a subquery on the table being deleted from; confirm the target database. -->
<delete id="deleteRepeatData">
delete from api_topsearch_douyin where id not in (
SELECT
t.min_id
FROM
( SELECT min( id ) AS min_id FROM api_topsearch_douyin GROUP BY word ) AS t
)
</delete>
</mapper>

@ -4,7 +4,7 @@
"http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.xjs.topsearch.mapper.ApiTopsearchWechatMapper">
<delete id="deleteRepeatData">
<delete id="deleteRepeatData_copy">
DELETE
FROM
api_topsearch_wechat
@ -18,4 +18,13 @@
)
</delete>
<!-- Deletes duplicate rows from api_topsearch_wechat: rows are grouped by word and only the row with the smallest id in each group is kept.
     NOTE(review): the inner SELECT is wrapped in the derived table "t", presumably so that MySQL accepts a subquery on the table being deleted from; confirm the target database. -->
<delete id="deleteRepeatData">
delete from api_topsearch_wechat where id not in (
SELECT
t.min_id
FROM
( SELECT min( id ) AS min_id FROM api_topsearch_wechat GROUP BY word ) AS t
)
</delete>
</mapper>

@ -4,7 +4,7 @@
"http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.xjs.topsearch.mapper.ApiTopsearchWeiboMapper">
<delete id="deleteRepeatData">
<delete id="deleteRepeatData_copy">
DELETE
FROM
api_topsearch_weibo
@ -18,4 +18,14 @@
)
</delete>
<!-- Deletes duplicate rows from api_topsearch_weibo: rows are grouped by hotword and only the row with the smallest id in each group is kept.
     NOTE(review): the inner SELECT is wrapped in the derived table "t", presumably so that MySQL accepts a subquery on the table being deleted from; confirm the target database. -->
<delete id="deleteRepeatData">
delete from api_topsearch_weibo where id not in (
SELECT
t.min_id
FROM
( SELECT min( id ) AS min_id FROM api_topsearch_weibo GROUP BY hotword ) AS t
)
</delete>
</mapper>

@ -66,7 +66,7 @@
</foreach>
</delete>
<delete id="deleteRepeatData">
<delete id="deleteRepeatData_copy">/*弃用--效率不行*/
DELETE
FROM
api_copywriting
@ -75,5 +75,14 @@
AND id NOT IN ( SELECT c.id FROM ( SELECT min( id ) id FROM api_copywriting GROUP BY content HAVING count( content )> 1 ) c )
</delete>
<!-- Deletes duplicate rows from api_copywriting: rows are grouped by content and only the row with the smallest id in each group is kept.
     NOTE(review): the inner SELECT is wrapped in the derived table "t", presumably so that MySQL accepts a subquery on the table being deleted from; confirm the target database. -->
<delete id="deleteRepeatData">
delete from api_copywriting where id not in (
SELECT
t.min_id
FROM
( SELECT min( id ) AS min_id FROM api_copywriting GROUP BY content ) AS t
)
</delete>
</mapper>

@ -0,0 +1,18 @@
package com.xjs.copywritingNetwork.mapper;
import com.baomidou.mybatisplus.core.mapper.BaseMapper;
import com.xjs.copywritingNetwork.pojo.CopyWritingNetwork;
/**
* MyBatis-Plus mapper for {@link CopyWritingNetwork} rows.
* Inherits the generic operations of {@link BaseMapper} and adds one custom
* statement, {@code deleteRepeatData}, which is defined in the mapper XML for this namespace.
* @author xiejs
* @since 2022-02-16
*/
public interface CopyWritingNetworkMapper extends BaseMapper<CopyWritingNetwork> {
/**
* Deletes duplicate rows from {@code webmagic_copywriting_network}: rows are grouped
* by {@code content} and only the row with the smallest {@code id} in each group is kept.
* @return the int result of the delete statement; callers log it as the number of
* duplicate rows removed
*/
int deleteRepeatData();
}

@ -0,0 +1,35 @@
package com.xjs.copywritingNetwork.pojo;
import com.baomidou.mybatisplus.annotation.FieldFill;
import com.baomidou.mybatisplus.annotation.TableField;
import com.baomidou.mybatisplus.annotation.TableName;
import lombok.Data;
import java.io.Serializable;
import java.util.Date;
/**
* Entity mapped to the {@code webmagic_copywriting_network} table; one instance holds a
* single paragraph of copywriting text collected by the crawler task.
* @author xiejs
* @since 2022-02-16
*/
@Data
@TableName("webmagic_copywriting_network")
public class CopyWritingNetwork implements Serializable {
private static final long serialVersionUID = 1L;
/** Row id. */
private Long id;
/** Copywriting tag; the crawler stores the link text of the category the article came from. */
private String type;
/** Copywriting theme; the crawler stores the article's heading text (may be null when no heading is found). */
private String theme;
/** Copywriting content; the crawler stores the text of one paragraph. */
private String content;
/** Creation time; marked for fill on insert (the fill handler itself is not visible here). */
@TableField(fill = FieldFill.INSERT)
private Date createTime;
}

@ -0,0 +1,17 @@
package com.xjs.copywritingNetwork.service;
import com.baomidou.mybatisplus.extension.service.IService;
import com.xjs.copywritingNetwork.pojo.CopyWritingNetwork;
/**
* Service contract for {@link CopyWritingNetwork} rows.
* Inherits the generic operations of {@link IService} and adds duplicate removal.
* @author xiejs
* @since 2022-02-16
*/
public interface CopyWritingNetworkService extends IService<CopyWritingNetwork> {
/**
* Removes duplicate copywriting rows.
* @return the int result reported by the underlying delete; callers log it as the
* number of duplicate rows removed
*/
int deleteRepeatData();
}

@ -0,0 +1,25 @@
package com.xjs.copywritingNetwork.service.impl;
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
import com.xjs.copywritingNetwork.mapper.CopyWritingNetworkMapper;
import com.xjs.copywritingNetwork.pojo.CopyWritingNetwork;
import com.xjs.copywritingNetwork.service.CopyWritingNetworkService;
import org.springframework.stereotype.Service;
import javax.annotation.Resource;
/**
 * Default {@link CopyWritingNetworkService} implementation, built on the generic
 * MyBatis-Plus {@link ServiceImpl}.
 *
 * @author xiejs
 * @since 2022-02-16
 */
@Service
public class CopyWritingNetworkServiceImpl
        extends ServiceImpl<CopyWritingNetworkMapper, CopyWritingNetwork>
        implements CopyWritingNetworkService {

    /** Mapper that carries the custom duplicate-removal statement. */
    @Resource
    private CopyWritingNetworkMapper copyWritingNetworkMapper;

    /**
     * Delegates duplicate removal to the mapper.
     *
     * @return the value returned by {@link CopyWritingNetworkMapper#deleteRepeatData()}
     */
    @Override
    public int deleteRepeatData() {
        final int removedRows = this.copyWritingNetworkMapper.deleteRepeatData();
        return removedRows;
    }
}

@ -0,0 +1,124 @@
package com.xjs.copywritingNetwork.task;
import com.xjs.common.util.HttpUtils;
import com.xjs.copywritingNetwork.pojo.CopyWritingNetwork;
import com.xjs.copywritingNetwork.service.CopyWritingNetworkService;
import lombok.extern.log4j.Log4j2;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
/**
* url:https://www.wenanwang.com/
*
* @author xiejs
* @since 2022-02-16
*/
@Component
@Log4j2
public class CopyWritingNetworkTask {
@Autowired
private HttpUtils httpUtils;
@Autowired
private CopyWritingNetworkService copyWritingNetworkService;
public static final String URL = "https://www.wenanwang.com/";
private static Pattern pattern = Pattern.compile("[0-9]*");
@Scheduled(fixedDelay = 1000 * 5)
public void reptileCopyWriting() {
try {
String html = httpUtils.doGetHtml(URL);
Document document = Jsoup.parse(html);
this.parseHtmlGetUrl(document);
} catch (Exception e) {
e.printStackTrace();
}finally {
int i = copyWritingNetworkService.deleteRepeatData();
log.info("删除文案网数据重复数:"+i);
}
}
/**
* htmlurl
*
* @param document
*/
private void parseHtmlGetUrl(Document document) {
Elements zyzt = document.getElementsByClass("zyzt");
Map<String, String> map = new HashMap<>();
for (Element element : zyzt) {
Elements a = element.select("a");
for (Element elementA : a) {
String text = elementA.text();
String href = elementA.attr("href");
map.put(text, href);
}
}
this.parseHtmlGetCopyWriting(map);
}
/**
* html
*
* @param map url
*/
private void parseHtmlGetCopyWriting(Map<String, String> map) {
ArrayList<CopyWritingNetwork> copyWritingNetworks = new ArrayList<>();
for (Map.Entry<String, String> entry : map.entrySet()) {
String url = entry.getValue();
String html = httpUtils.doGetHtml(url);
Document document = Jsoup.parse(html);
Elements a = document.select(".newslist li h5 a");
for (Element element : a) {
String href = element.attr("href");
String newUrl = URL + href;
String cw = httpUtils.doGetHtml(newUrl);
Document cwDocument = Jsoup.parse(cw);
Elements h1 = cwDocument.select(".newsview > h1");
String theme = null;
for (Element cH1 : h1) {
theme = cH1.text();
}
Elements ps = cwDocument.select(".content > p");
for (Element p : ps) {
CopyWritingNetwork copyWritingNetwork = new CopyWritingNetwork();
copyWritingNetwork.setTheme(theme);
copyWritingNetwork.setContent(p.text());
copyWritingNetwork.setType(entry.getKey());
String content = copyWritingNetwork.getContent();
boolean matches = pattern.matcher(content).matches();
if (StringUtils.isNotEmpty(content) && !matches) {
copyWritingNetworks.add(copyWritingNetwork);
}
}
}
}
copyWritingNetworkService.saveBatch(copyWritingNetworks, 20);
}
}

@ -0,0 +1,15 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper
PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
"http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.xjs.copywritingNetwork.mapper.CopyWritingNetworkMapper">
<!-- Deletes duplicate rows from webmagic_copywriting_network: rows are grouped by content and only the row with the smallest id in each group is kept.
     NOTE(review): the inner SELECT is wrapped in the derived table "t", presumably so that MySQL accepts a subquery on the table being deleted from; confirm the target database. -->
<delete id="deleteRepeatData">
delete from webmagic_copywriting_network where id not in (
SELECT
t.min_id
FROM
( SELECT min( id ) AS min_id FROM webmagic_copywriting_network GROUP BY content ) AS t
)
</delete>
</mapper>

@ -38,16 +38,11 @@
</delete>
<delete id="deleteRepeatData">
DELETE
FROM
webmagic_sina_news
WHERE
title IN ( SELECT t.title FROM ( SELECT title FROM webmagic_sina_news GROUP BY title HAVING count( title ) > 1 ) t )
AND id NOT IN (
delete from webmagic_sina_news where id not in (
SELECT
c.id
t.min_id
FROM
( SELECT min( id ) id FROM webmagic_sina_news GROUP BY title HAVING count( title )> 1 ) c
( SELECT min( id ) AS min_id FROM webmagic_sina_news GROUP BY title ) AS t
)
</delete>
</mapper>
Loading…
Cancel
Save