parent
7b058766b5
commit
491e100071
@ -0,0 +1,38 @@
|
||||
package com.xjs.consts;
|
||||
|
||||
/**
 * Constants shared by the web crawlers.
 *
 * @author xiejs
 * @since 2022-02-19
 */
public class ReptileConst {

    //---------------------------url---------------------------------

    /**
     * Sina News URL.
     */
    public static final String SINA_NEWS_URL = "https://news.sina.com.cn/";

    /**
     * Copywriting site (wenanwang.com) URL.
     */
    public static final String COPY_WRITING_NETWORK_URL = "https://www.wenanwang.com/";

    /**
     * 51job search-listing URL, including its full query string.
     *
     * <p>The {@code degreefrom} parameter must be spelled out in full: a
     * literal {@code &deg} is easily collapsed into the degree sign by
     * HTML-aware tooling, which corrupts the query string.
     */
    public static final String _51_JOB_URL =
            "https://search.51job.com/list/000000,000000,0000,01,9,99,java,2,1.html"
                    + "?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99"
                    + "&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=";

    /**
     * BOSS Zhipin recruitment site URL.
     */
    public static final String BOSS_JOB_URL = "https://www.zhipin.com";

}
|
@ -0,0 +1,74 @@
|
||||
package com.xjs._36wallpaper.webmagic;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
|
||||
/**
|
||||
* 36壁纸网站 爬虫处理
|
||||
*
|
||||
* @author xiejs
|
||||
* @since 2022-02-19
|
||||
*/
|
||||
public class _36wallpaperProcessor implements PageProcessor {
|
||||
|
||||
private static final String URL = "https://www.3gbizhi.com/";
|
||||
|
||||
|
||||
private Site site = Site.me()
|
||||
.setCharset("uft8")//设置字符编码
|
||||
.setTimeOut(10000)//设置超时时间
|
||||
.setRetrySleepTime(2000)//设置重试间隔时间
|
||||
.setCycleRetryTimes(3)//设置重试次数
|
||||
.setSleepTime(10)//设置两个页面之间的间隔时间
|
||||
;
|
||||
|
||||
public void setSite(Site site) {
|
||||
this.site = site;
|
||||
}
|
||||
|
||||
//解析页面
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
//解析返回的数据page,并且把解析的结果放到ResultItems中
|
||||
|
||||
/*//第一种写法:css选择器
|
||||
page.putField("url_css", page.getHtml().css(".tmenu li a").all());
|
||||
|
||||
|
||||
//第二种写法:xpath
|
||||
page.putField("url_xpath",page.getHtml().xpath("//ul[@class=tmenu]/li/a").all());
|
||||
|
||||
|
||||
//第三种写法:正则表达式
|
||||
page.putField("url_regex", page.getHtml().css(".tmenu li a").regex(".*图片*.*").all());*/
|
||||
|
||||
// todo 爬取36壁纸图片
|
||||
|
||||
|
||||
//获取链接
|
||||
page.addTargetRequests(page.getHtml().css(".tmenu li").links().all());
|
||||
page.putField("url", page.getHtml().css(".imgw").links().all());
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return site;
|
||||
}
|
||||
|
||||
/**
|
||||
* 执行爬虫
|
||||
*/
|
||||
public void run() {
|
||||
Spider.create(new _36wallpaperProcessor()).addUrl(URL).thread(5).runAsync();
|
||||
}
|
||||
|
||||
|
||||
public static void main(String[] args) {
|
||||
Spider.create(new _36wallpaperProcessor()).addUrl("https://www.3gbizhi.com/").thread(5).run();
|
||||
|
||||
}
|
||||
}
|
@ -0,0 +1,5 @@
|
||||
# Root logger: INFO level, sent to appender A1.
log4j.rootLogger = INFO,A1

# A1 writes to the console using a pattern layout.
log4j.appender.A1 = org.apache.log4j.ConsoleAppender
log4j.appender.A1.layout = org.apache.log4j.PatternLayout
# Line format: timestamp {thread} {logger}-{level} message, then a line break.
log4j.appender.A1.layout.ConversionPattern = %-d{yyyy-MM-dd HH:mm:ss,SSS} {%t} {%c}-{%p} %m%n
|
Loading…
Reference in new issue