現在我們的大致思路有了,接下來就是實現的問題了。這裏我推薦一個用Java爬取頁面的好工具:[weblink url="https://github.com/code4craft/webmagic"]webmagic[/weblink]
webmagic:webmagic是一個開源的Java垂直爬蟲框架,目標是簡化爬蟲的開發流程,讓開發者專注於邏輯功能的開發。webmagic的核心非常簡單,但是覆蓋爬蟲的整個流程,也是很好的學習爬蟲開發的材料。 web爬蟲是一種技術,webmagic致力於將這種技術的實現成本降低,但是出於對資源提供者的尊重,webmagic不會做反封鎖的事情,包括:驗證碼破解、代理切換、自動登錄等。
下面是利用這個工具爬取頁面的代碼:
package com.wbdb.action.baidu;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import org.apache.http.HttpHost;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.ExecutionContext;
import org.apache.http.protocol.HttpContext;
import us.codecraft.webmagic.MultiPageModel;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ComboExtract;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.TargetUrl;
import us.codecraft.webmagic.pipeline.MultiPagePipeline;
/**
 * Page model for one Baidu result page of the query {@code site:www.xxku.net}.
 *
 * <p>Every matching result page contributes the Baidu redirect links it lists
 * ({@link #getBaiduUrl()}). {@link #combine(MultiPageModel)} merges two pages into one model.
 * {@link #main(String[])} runs the crawl, requests every collected link and keeps the ones whose
 * final response is HTTP 404.
 *
 * @author www.xxku.net<br>
 */
@TargetUrl(value = "http://www.baidu.com/s\\?wd=site%3Awww.xxku.net&pn=\\d+&ie=utf-8")
public class Search implements MultiPageModel {

    /** Baidu redirect links ({@code http://www.baidu.com/link?url=...}) found on the page. */
    @ExtractBy(value = "href=\"(http://www\\.baidu\\.com/link\\?url=.*?)\"", type = ExtractBy.Type.Regex)
    private List<String> baiduUrl;

    /** Value returned by {@link #getPageKey()}; taken from the pager element of the page. */
    @ExtractBy(value = "//p[@id='page']/strong/span[@class='pc']", type = ExtractBy.Type.XPath)
    private String pageKey;

    /** Number of the current page, or {@code null} when the pattern does not match. */
    @ExtractBy(value = "<span class=\"current\">(\\d+)</span>", type = ExtractBy.Type.Regex)
    private String page;

    /** Links to the sibling result pages listed in the pager; may be absent ({@code notNull = false}). */
    @ComboExtract(value = {
            @ExtractBy("//p[@id='page']/a"),
            @ExtractBy(value = "http://www.baidu.com/s\\?wd=site%3Awww.xxku.net&pn=\\d+&ie=utf-8", type = ExtractBy.Type.Regex) }, multi = true, notNull = false)
    private List<String> otherPage;

    /** Returns the extracted page key. */
    @Override
    public String getPageKey() {
        return pageKey;
    }

    /** Returns the extracted links to the other result pages (may be {@code null}). */
    @Override
    public Collection<String> getOtherPages() {
        return otherPage;
    }

    /** Returns the extracted page number, defaulting to {@code "1"} when none was extracted. */
    @Override
    public String getPage() {
        if (page == null) {
            return "1";
        }
        return page;
    }

    /**
     * Merges this page with another page into a new model.
     *
     * <p>The returned model carries this page's key, page number and pager links, and a fresh list
     * holding this page's links followed by the other page's links. Neither input is modified, and
     * a side without extracted links contributes nothing instead of failing.
     *
     * @param multiPageModel the page to append; must be a {@code Search}
     * @return a new {@code Search} holding the links of both pages
     * @throws ClassCastException if {@code multiPageModel} is not a {@code Search}
     */
    @Override
    public MultiPageModel combine(MultiPageModel multiPageModel) {
        Search other = (Search) multiPageModel;

        List<String> mergedUrls = new ArrayList<String>();
        if (this.baiduUrl != null) {
            mergedUrls.addAll(this.baiduUrl);
        }
        if (other.baiduUrl != null) {
            mergedUrls.addAll(other.baiduUrl);
        }

        // The result must carry the data itself: the caller keeps combining onto the returned
        // object, so returning an empty model would lose every link collected so far.
        Search merged = new Search();
        merged.pageKey = this.pageKey;
        merged.page = this.page;
        merged.otherPage = this.otherPage;
        merged.baiduUrl = mergedUrls;
        return merged;
    }

    /** Returns the Baidu redirect links held by this model (may be {@code null}). */
    public List<String> getBaiduUrl() {
        return baiduUrl;
    }

    /** Replaces the Baidu redirect links held by this model. */
    public void setBaiduUrl(List<String> baiduUrl) {
        this.baiduUrl = baiduUrl;
    }

    @Override
    public String toString() {
        return "Search [baiduUrl=" + baiduUrl + ", pageKey=" + pageKey + ", page=" + page + ", otherPage=" + otherPage
                + "]";
    }

    /**
     * Crawls the Baidu result pages, resolves every collected redirect link and prints the
     * resolved URLs that answer with HTTP 404, one per line.
     *
     * @param args unused
     * @throws IOException if requesting one of the collected links fails
     */
    public static void main(String[] args) throws IOException {
        OOSpider o = OOSpider.create(
                Site.me().addStartUrl("http://www.baidu.com/s?wd=site%3Awww.xxku.net&pn=0&ie=utf-8"), Search.class);
        o.addPipeline(new MultiPagePipeline());
        o.addPipeline(new SearchPipeline());
        o.run();

        List<String> baiduUrlList = SearchPipeline.getBaiduUrlList();
        Search s = new Search();
        List<String> realUrl404 = new ArrayList<String>();
        // Collect the URLs that answer with 404.
        for (String baiduLink : baiduUrlList) {
            String url404 = s.getRealUrl(baiduLink);
            if (url404 != null) {
                realUrl404.add(url404);
            }
        }

        // Building the sitemap XML would be easy with dom4j; here the links are simply printed
        // so they can be fed to an online sitemap generator.
        for (String url404 : realUrl404) {
            System.out.println(url404);
        }
    }

    /**
     * Requests {@code url} and, when the final response is HTTP 404, returns the URL that was
     * actually requested.
     *
     * <p>The URL is rebuilt from the target host and request stored in the execution context
     * (presumably the last hop after any redirects the client followed - confirm against the
     * HttpClient documentation).
     *
     * @param url the link to check
     * @return the resolved URL if it answered 404; {@code null} for any other status or when the
     *         execution context does not hold the expected host and request
     * @throws IOException if executing the request or closing the connection fails
     */
    private String getRealUrl(String url) throws IOException {
        CloseableHttpClient httpclient = HttpClients.createDefault();
        try {
            HttpContext localContext = new BasicHttpContext();
            CloseableHttpResponse response = httpclient.execute(new HttpGet(url), localContext);
            try {
                if (response.getStatusLine().getStatusCode() != 404) {
                    return null;
                }
                Object target = localContext.getAttribute(ExecutionContext.HTTP_TARGET_HOST);
                Object request = localContext.getAttribute(ExecutionContext.HTTP_REQUEST);
                // Check the attributes explicitly instead of relying on a catch-all for
                // NullPointerException / ClassCastException.
                if (!(target instanceof HttpHost) || !(request instanceof HttpUriRequest)) {
                    return null;
                }
                HttpUriRequest req = (HttpUriRequest) request;
                if (req.getURI().isAbsolute()) {
                    // Already a full URL; prefixing the host would produce a malformed result.
                    return req.getURI().toString();
                }
                return ((HttpHost) target).toString() + req.getURI();
            } finally {
                response.close();
            }
        } finally {
            // The client owns a connection manager; close it so each call does not leak one.
            httpclient.close();
        }
    }
}