parent
6bc9887cc9
commit
fba7ae528e
@ -0,0 +1,18 @@
|
||||
package com.xjs.copywritingNetwork.mapper;
|
||||
|
||||
import com.baomidou.mybatisplus.core.mapper.BaseMapper;
|
||||
import com.xjs.copywritingNetwork.pojo.CopyWritingNetwork;
|
||||
|
||||
/**
|
||||
* 文案网mapper
|
||||
* @author xiejs
|
||||
* @since 2022-02-16
|
||||
*/
|
||||
public interface CopyWritingNetworkMapper extends BaseMapper<CopyWritingNetwork> {
|
||||
|
||||
/**
|
||||
* 删除重复数据
|
||||
* @return int
|
||||
*/
|
||||
int deleteRepeatData();
|
||||
}
|
@ -0,0 +1,35 @@
|
||||
package com.xjs.copywritingNetwork.pojo;
|
||||
|
||||
import com.baomidou.mybatisplus.annotation.FieldFill;
|
||||
import com.baomidou.mybatisplus.annotation.TableField;
|
||||
import com.baomidou.mybatisplus.annotation.TableName;
|
||||
import lombok.Data;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Date;
|
||||
|
||||
/**
|
||||
* 文案网数据实体表
|
||||
* @author xiejs
|
||||
* @since 2022-02-16
|
||||
*/
|
||||
@Data
|
||||
@TableName("webmagic_copywriting_network")
|
||||
public class CopyWritingNetwork implements Serializable {
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
private Long id;
|
||||
|
||||
/** 文案标签 */
|
||||
private String type;
|
||||
|
||||
/** 文案主题 */
|
||||
private String theme;
|
||||
|
||||
/** 文案内容 */
|
||||
private String content;
|
||||
|
||||
/** 创建时间 */
|
||||
@TableField(fill = FieldFill.INSERT)
|
||||
private Date createTime;
|
||||
}
|
@ -0,0 +1,17 @@
|
||||
package com.xjs.copywritingNetwork.service;
|
||||
|
||||
import com.baomidou.mybatisplus.extension.service.IService;
|
||||
import com.xjs.copywritingNetwork.pojo.CopyWritingNetwork;
|
||||
|
||||
/**
|
||||
* 文案网service接口
|
||||
* @author xiejs
|
||||
* @since 2022-02-16
|
||||
*/
|
||||
public interface CopyWritingNetworkService extends IService<CopyWritingNetwork> {
|
||||
/**
|
||||
* 删除重复数据
|
||||
* @return int
|
||||
*/
|
||||
int deleteRepeatData();
|
||||
}
|
@ -0,0 +1,25 @@
|
||||
package com.xjs.copywritingNetwork.service.impl;
|
||||
|
||||
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
|
||||
import com.xjs.copywritingNetwork.mapper.CopyWritingNetworkMapper;
|
||||
import com.xjs.copywritingNetwork.pojo.CopyWritingNetwork;
|
||||
import com.xjs.copywritingNetwork.service.CopyWritingNetworkService;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import javax.annotation.Resource;
|
||||
|
||||
/**
|
||||
* @author xiejs
|
||||
* @since 2022-02-16
|
||||
*/
|
||||
@Service
|
||||
public class CopyWritingNetworkServiceImpl extends ServiceImpl<CopyWritingNetworkMapper, CopyWritingNetwork> implements CopyWritingNetworkService {
|
||||
|
||||
@Resource
|
||||
private CopyWritingNetworkMapper copyWritingNetworkMapper;
|
||||
|
||||
@Override
|
||||
public int deleteRepeatData() {
|
||||
return copyWritingNetworkMapper.deleteRepeatData();
|
||||
}
|
||||
}
|
@ -0,0 +1,124 @@
|
||||
package com.xjs.copywritingNetwork.task;
|
||||
|
||||
import com.xjs.common.util.HttpUtils;
|
||||
import com.xjs.copywritingNetwork.pojo.CopyWritingNetwork;
|
||||
import com.xjs.copywritingNetwork.service.CopyWritingNetworkService;
|
||||
import lombok.extern.log4j.Log4j2;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.select.Elements;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.scheduling.annotation.Scheduled;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* 文案网爬虫任务 url:https://www.wenanwang.com/
|
||||
*
|
||||
* @author xiejs
|
||||
* @since 2022-02-16
|
||||
*/
|
||||
@Component
|
||||
@Log4j2
|
||||
public class CopyWritingNetworkTask {
|
||||
|
||||
@Autowired
|
||||
private HttpUtils httpUtils;
|
||||
@Autowired
|
||||
private CopyWritingNetworkService copyWritingNetworkService;
|
||||
|
||||
|
||||
public static final String URL = "https://www.wenanwang.com/";
|
||||
|
||||
private static Pattern pattern = Pattern.compile("[0-9]*");
|
||||
|
||||
@Scheduled(fixedDelay = 1000 * 5)
|
||||
public void reptileCopyWriting() {
|
||||
try {
|
||||
String html = httpUtils.doGetHtml(URL);
|
||||
|
||||
Document document = Jsoup.parse(html);
|
||||
|
||||
this.parseHtmlGetUrl(document);
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}finally {
|
||||
int i = copyWritingNetworkService.deleteRepeatData();
|
||||
log.info("删除文案网数据重复数:"+i);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 解析html获取url
|
||||
*
|
||||
* @param document
|
||||
*/
|
||||
private void parseHtmlGetUrl(Document document) {
|
||||
Elements zyzt = document.getElementsByClass("zyzt");
|
||||
|
||||
Map<String, String> map = new HashMap<>();
|
||||
for (Element element : zyzt) {
|
||||
Elements a = element.select("a");
|
||||
for (Element elementA : a) {
|
||||
String text = elementA.text();
|
||||
String href = elementA.attr("href");
|
||||
map.put(text, href);
|
||||
}
|
||||
}
|
||||
|
||||
this.parseHtmlGetCopyWriting(map);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 解析html获取文案内容并持久化
|
||||
*
|
||||
* @param map 存放了url和名称
|
||||
*/
|
||||
private void parseHtmlGetCopyWriting(Map<String, String> map) {
|
||||
ArrayList<CopyWritingNetwork> copyWritingNetworks = new ArrayList<>();
|
||||
|
||||
for (Map.Entry<String, String> entry : map.entrySet()) {
|
||||
String url = entry.getValue();
|
||||
String html = httpUtils.doGetHtml(url);
|
||||
Document document = Jsoup.parse(html);
|
||||
Elements a = document.select(".newslist li h5 a");
|
||||
for (Element element : a) {
|
||||
|
||||
String href = element.attr("href");
|
||||
String newUrl = URL + href;
|
||||
|
||||
String cw = httpUtils.doGetHtml(newUrl);
|
||||
Document cwDocument = Jsoup.parse(cw);
|
||||
Elements h1 = cwDocument.select(".newsview > h1");
|
||||
String theme = null;
|
||||
for (Element cH1 : h1) {
|
||||
theme = cH1.text();
|
||||
}
|
||||
Elements ps = cwDocument.select(".content > p");
|
||||
for (Element p : ps) {
|
||||
CopyWritingNetwork copyWritingNetwork = new CopyWritingNetwork();
|
||||
copyWritingNetwork.setTheme(theme);
|
||||
copyWritingNetwork.setContent(p.text());
|
||||
copyWritingNetwork.setType(entry.getKey());
|
||||
String content = copyWritingNetwork.getContent();
|
||||
boolean matches = pattern.matcher(content).matches();
|
||||
if (StringUtils.isNotEmpty(content) && !matches) {
|
||||
copyWritingNetworks.add(copyWritingNetwork);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
copyWritingNetworkService.saveBatch(copyWritingNetworks, 20);
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
@ -0,0 +1,15 @@
|
||||
<?xml version="1.0" encoding="UTF-8" ?>
|
||||
<!DOCTYPE mapper
|
||||
PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
|
||||
"http://mybatis.org/dtd/mybatis-3-mapper.dtd">
|
||||
<mapper namespace="com.xjs.copywritingNetwork.mapper.CopyWritingNetworkMapper">
|
||||
|
||||
<!--
    Removes duplicated rows: for each distinct content value only the row with the
    smallest id is kept. The aggregate is wrapped in derived table t before NOT IN uses it
    (presumably to satisfy the database's rule about selecting from the delete target; confirm).
-->
<delete id="deleteRepeatData">
    DELETE FROM webmagic_copywriting_network
    WHERE id NOT IN (
        SELECT t.min_id
        FROM (
            SELECT MIN(id) AS min_id
            FROM webmagic_copywriting_network
            GROUP BY content
        ) AS t
    )
</delete>
|
||||
</mapper>
|
Loading…
Reference in new issue