webmagic是java爬蟲框架中比較簡單易上手的一個。官網連接:http://webmagic.io/
下面的例子是使用這個框架來爬取工商銀行的私人理財推薦分頁列表數據。頁面連接爲:https://mybank.icbc.com.cn/se...$17$TJ&Area_code=0200&requestChannel=302
1.引入webmagic:maven配置
<dependency> <groupId>us.codecraft</groupId> <artifactId>webmagic-core</artifactId> <version>0.7.3</version> </dependency> <dependency> <groupId>us.codecraft</groupId> <artifactId>webmagic-extension</artifactId> <version>0.7.3</version> </dependency>
若是項目已經引入slf4j記錄日誌,則須要在webmagic-extension中排除slf4j-log4j12。以下:
<dependency> <groupId>us.codecraft</groupId> <artifactId>webmagic-extension</artifactId> <version>0.7.3</version> <exclusions> <exclusion> <groupId>org.slf4j</groupId> <artifactId>slf4j-log4j12</artifactId> </exclusion> </exclusions> </dependency>
2.日誌文件配置以下:
log4j.rootLogger=info, ServerDailyRollingFile, stdout
log4j.appender.ServerDailyRollingFile=org.apache.log4j.DailyRollingFileAppender
log4j.appender.ServerDailyRollingFile.DatePattern='.'yyyy-MM-dd
log4j.appender.ServerDailyRollingFile.File=/home/myfile/log/mps.log //日誌文件路徑配置
log4j.appender.ServerDailyRollingFile.layout=org.apache.log4j.PatternLayout
log4j.appender.ServerDailyRollingFile.layout.ConversionPattern=%d - %m%n
log4j.appender.ServerDailyRollingFile.Append=true
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %p [%c] %m%n
3.爬蟲代碼
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;
import java.util.ArrayList;
import java.util.List;
/**
*私行推薦理財
*/
public class PrivateRecommendFinanceSpider implements PageProcessor {
private Logger log = LoggerFactory.getLogger(PrivateRecommendFinanceSpider.class);
// 定義鏈接失敗時,重試機制
private static Site site = Site.me().setRetryTimes(3).setSleepTime(100);
public Site getSite() {
return site;
}
@Override
public void process(Page page) {
log.info("私行推薦理財爬蟲開始====");
String pageNum=page.getHtml().xpath("//*[@id=pageturn]/ul/li[3]/span[2]/b/text()").get();
int pageNumInt=Integer.parseInt(pageNum);
for (int i = 1; i < pageNumInt; i++) {
//獲取下一頁的連接,將當前頁數拼接到url上
String nextUrl="https://mybank.icbc.com.cn/servlet/ICBCBaseReqServletNoSession?dse_operationName=per_FinanceCurProListP3NSOp&p3bank_error_backid=120103&pageFlag=0&menuLabel=10$17$TJ&Area_code=0200&requestChannel=302&nowPageNum_turn="+(i+1);
//將下一頁連接添加到爬蟲隊列中
page.addTargetRequest(nextUrl);
}
List<Selectable> nodes = page.getHtml().xpath("//*[@id=datatableModel]/div").nodes(); //獲取列表的條數
int length=nodes.size();
System.out.println(length);
List<Object> list = new ArrayList<>();
for (int i = 0; i < length-2; i++) {
PrivateRecommend privateRecommend = new PrivateRecommend();
//根據xpath獲取對應節點的內容
privateRecommend.setProductName(page.getHtml().xpath("/html/body/div[1]/div[1]/div[3]/div["+(i+2)+"]/div[2]/div[1]/span[1]/span[1]/a/text()").get());
privateRecommend.setPerformanceBanchmark(page.getHtml().xpath("//*[@id=doublelabel1_"+i+"-content]/text()").get());
privateRecommend.setUpPurchaseAmount(page.getHtml().xpath("//*[@id=doublelabel2_"+i+"-content]/b/text()").get());
privateRecommend.setInvestmentPeriod(page.getHtml().xpath("//*[@id=doublelabel2_"+i+"-content]/b/text()").get());
privateRecommend.setRiskClass(page.getHtml().xpath("//*[@id=tt"+i+"-content]/text()").get());
privateRecommend.setRaisingPeriod(page.getHtml().xpath("/html/body/div[1]/div[1]/div[3]/div["+(i+2)+"]/div[2]/div[1]/span[2]/span/text()").get());
list.add(privateRecommend);
}
//將封裝的list對象傳到pipline中
page.putField("privateRecommend",list);
}
//執行main方法
public void main(String[] args){
Spider.create(new PrivateRecommendFinanceSpider()).addUrl("https://mybank.icbc.com.cn/servlet/ICBCBaseReqServletNoSession?dse_operationName=per_FinanceCurProListP3NSOp&p3bank_error_backid=120103&pageFlag=0&menuLabel=10$17$TJ&Area_code=0200&requestChannel=302")
.addPipeline(new ConsolePipeline()).run();//pipline中保存數據,此例中consolepipeline直接將內容打印到控制檯。可本身定義
log.info("=====私行推薦理財爬蟲執行完畢");
}maven
4.對象的內容
@Data
public class PrivateRecommend {url
/**主鍵*/ private Long id; /** * 名稱 */ private String productName; /** *預期年化收益率 */ private String performanceBanchmark; /** *起購金額 */ private String upPurchaseAmount; /** *期限 */ private String investmentPeriod; /** * 風險等級 */ private String riskClass; /** * 最近購買開放日 */ private String raisingPeriod; /** * 更新日期 */ private String updateTime;
}
5.可能存在的坑 a.可能會存在slf4j日誌jar的衝突 b.xpath獲取節點上的內容 c.獲取列表下一頁列表數 d.將下一頁連接添加到爬蟲隊列中