A simple web crawler in Java

To crawl pages I read through other people's tutorials. The usual recipe is to use HtmlParser to parse the HTML and extract the links a page contains, and HttpClient to fetch the page data. Below is the Spider class I wrote:

```java
package com.openzone.search.spider;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.HashSet;
import java.util.Set;

import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class Spider {
    String[] seeds;   // seed URLs: the pages the crawler starts from
    String line;      // URL prefix filter, e.g. line="http://localhost"; only URLs starting with it are followed
    String savepath;  // directory where fetched pages are stored
    String encoding;  // character encoding the crawler uses

    public Spider(String[] seeds, String line, String savepath, String encoding) {
        this.seeds = seeds;
        this.line = line;
        this.savepath = savepath;
        this.encoding = encoding;
    }

    /**
     * Initialize the URL queue with the seeds.
     */
    public void init() {
        Set<String> seedsSet = new HashSet<String>();
        for (int i = 0; i < seeds.length; i++) {
            seedsSet.add(seeds[i]);
        }
        UrlTables.addToUnvisitedUrlSet(seedsSet);
    }

    public void run() throws HttpException, IOException, ParserException {
        init();
        // crawl at most 20 pages
        for (int i = 0; i < 20; i++) {
            if (!UrlTables.getUnvisitedUrl().isEmpty()) {
                String url = UrlTables.getFirstFromUnvisitedUrlSet();
                catchPages(url);
                UrlTables.addToVisitedUrlSet(url);
                UrlTables.addToUnvisitedUrlSet(getUrls(url));
            }
        }
    }

    public void catchPages(String url) {
        String filename = null;
        HttpClient httpClient = new HttpClient();
        // connection timeout: 5s
        httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(5000);
        // build the GET method and set its parameters
        GetMethod getMethod = new GetMethod(url);
        // socket (read) timeout: 5s
        getMethod.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, 5000);
        // retry handler
        getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
                new DefaultHttpMethodRetryHandler());
        // request encoding
        getMethod.getParams().setParameter(HttpMethodParams.HTTP_CONTENT_CHARSET, encoding);
        getMethod.addRequestHeader("Content-Type", "text/html; charset=UTF-8");
        // execute the HTTP GET request
        try {
            int statusCode = httpClient.executeMethod(getMethod);
            System.out.println(statusCode);
            if (statusCode == 200) {
                InputStream responseBody = getMethod.getResponseBodyAsStream();
                filename = getFileNameByUrl(url,
                        getMethod.getResponseHeader("Content-Type").getValue());
                if (responseBody != null) {
                    saveToLocal(responseBody, filename);
                }
                System.out.println("get success");
            } else {
                System.out.println("get failed");
            }
        } catch (HttpException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            getMethod.releaseConnection();
        }
    }

    /*
     * Save the byte stream fetched by catchPages to a local file.
     */
    public void saveToLocal(InputStream responseBody, String filename) throws IOException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(responseBody, encoding));
        File file = new File(savepath, filename);
        FileOutputStream fileOutputStream = new FileOutputStream(file);
        OutputStreamWriter writer = new OutputStreamWriter(fileOutputStream, encoding);
        String line;
        while ((line = reader.readLine()) != null) {
            writer.write(line);
            writer.write(System.getProperty("line.separator"));
        }
        writer.flush();
        writer.close();
        reader.close();
    }

    /*
     * Extract the URLs contained in a page.
     */
    public Set<String> getUrls(String url) throws ParserException {
        Set<String> links = new HashSet<String>();
        Parser parser = new Parser(url);
        parser.setEncoding(encoding);
        // matches <frame src=...> tags, which HtmlParser has no dedicated tag class for
        NodeFilter frameFilter = new NodeFilter() {
            @Override
            public boolean accept(Node node) {
                return node.getText().startsWith("frame src=");
            }
        };
        OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
        NodeList list = parser.extractAllNodesThatMatch(linkFilter);
        for (int i = 0; i < list.size(); i++) {
            Node tag = list.elementAt(i);
            if (tag instanceof LinkTag) {
                // <a> tag
                String linkUrl = ((LinkTag) tag).getLink();
                if (linkFilter(linkUrl)) {
                    links.add(linkUrl);
                }
            } else {
                // <frame> tag: extract the src attribute by hand
                String frameTxt = tag.getText();
                int start = frameTxt.indexOf("src=");
                frameTxt = frameTxt.substring(start);
                int end = frameTxt.indexOf(" ");
                if (end == -1) {
                    end = frameTxt.indexOf(">");
                }
                String frameUrl = frameTxt.substring(5, end - 1);
                if (linkFilter(frameUrl)) {
                    links.add(frameUrl);
                }
            }
        }
        return links;
    }

    // the prefix rule the crawler follows
    public boolean linkFilter(String url) {
        return url.startsWith(line);
    }

    // sanitize the URL into a legal file name, otherwise saving would fail
    public String getFileNameByUrl(String url, String contentType) {
        // strip the leading "http://"
        url = url.substring(7);
        if (contentType.indexOf("html") != -1) {
            // text/html content
            return url.replaceAll("[\\?/:*|<>\"]", "_") + ".html";
        } else {
            return url.replaceAll("[\\?/:*|<>\"]", "_") + "."
                    + contentType.substring(contentType.lastIndexOf("/") + 1);
        }
    }
}
```
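The fetching step above relies on Commons HttpClient 3.x, which has long been retired. On JDK 11 or newer the same GET-with-timeouts logic can be written with the built-in java.net.http.HttpClient. A minimal sketch, assuming the same 5-second timeouts and a placeholder seed URL:

```java
import java.io.IOException;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.time.Duration;

public class FetchSketch {
    public static void main(String[] args) throws IOException, InterruptedException {
        // connection timeout of 5s, mirroring the Spider settings above
        HttpClient client = HttpClient.newBuilder()
                .connectTimeout(Duration.ofSeconds(5))
                .build();
        HttpRequest request = HttpRequest.newBuilder()
                .uri(URI.create("http://localhost/openzone/")) // placeholder seed URL
                .timeout(Duration.ofSeconds(5))                // per-request timeout, like SO_TIMEOUT
                .GET()
                .build();
        HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
        if (response.statusCode() == 200) {
            System.out.println(response.body());
        }
    }
}
```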
Next comes the class that stores the URL tables, i.e. the visited set and the unvisited queue:

```java
package com.openzone.search.spider;

import java.util.HashSet;
import java.util.LinkedList;
import java.util.Set;

public class UrlTables {
    private static Set<String> visitedUrlSet = new HashSet<String>();
    private static LinkedList<String> unvisitedUrlSet = new LinkedList<String>();

    public static Set<String> getVisitedUrl() {
        return visitedUrlSet;
    }

    public static void setVisitedUrl(Set<String> visitedUrl) {
        UrlTables.visitedUrlSet = visitedUrl;
    }

    public static LinkedList<String> getUnvisitedUrl() {
        return unvisitedUrlSet;
    }

    public static void setUnvisitedUrl(LinkedList<String> unvisitedUrl) {
        UrlTables.unvisitedUrlSet = unvisitedUrl;
    }

    public static void addToVisitedUrlSet(String url) {
        visitedUrlSet.add(url);
    }

    public static boolean isUnvisitedUrlSetEmpty() {
        return unvisitedUrlSet.isEmpty();
    }

    // queue only URLs that are neither visited nor already waiting
    public static void addToUnvisitedUrlSet(Set<String> urls) {
        for (String url : urls) {
            if (!isVisited(url) && !unvisitedUrlSet.contains(url)) {
                unvisitedUrlSet.add(url);
            }
        }
    }

    public static boolean isVisited(String url) {
        return visitedUrlSet.contains(url);
    }

    // pop the head of the unvisited queue
    public static String getFirstFromUnvisitedUrlSet() {
        return unvisitedUrlSet.removeFirst();
    }
}
```
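Note that both collections are static and unsynchronized, which is fine for this single-threaded crawler but would need locking if you ever ran several fetchers in parallel. A hypothetical smoke test (UrlTablesDemo is not part of the original code) showing that the frontier never queues a URL twice:

```java
package com.openzone.search.spider;

import java.util.HashSet;
import java.util.Set;

// Hypothetical demo: duplicates and already-visited URLs must not be re-queued.
public class UrlTablesDemo {
    public static void main(String[] args) {
        Set<String> batch = new HashSet<String>();
        batch.add("http://localhost/a");
        batch.add("http://localhost/b");
        UrlTables.addToUnvisitedUrlSet(batch);

        // crawl one URL and mark it visited
        String first = UrlTables.getFirstFromUnvisitedUrlSet();
        UrlTables.addToVisitedUrlSet(first);

        // offering the same batch again re-queues nothing:
        // one URL is visited, the other is already waiting
        UrlTables.addToUnvisitedUrlSet(batch);
        System.out.println(UrlTables.getUnvisitedUrl()); // prints a single remaining URL
    }
}
```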
Finally, instantiate the crawler and set it to work:

```java
package com.openzone.search.spider;

import java.io.IOException;

import org.apache.commons.httpclient.HttpException;
import org.htmlparser.util.ParserException;

public class SpiderRun {
    public static void main(String[] args) {
        String[] seeds = {"http://localhost/openzone/"};
        String line = "http://localhost";
        String savepath = "D:\\javaworkspace\\openzone";
        String encoding = "utf-8";
        Spider spider = new Spider(seeds, line, savepath, encoding);
        try {
            spider.run();
        } catch (HttpException e) {
            e.printStackTrace();
        } catch (ParserException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
```
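One thing worth checking before pointing this at a real site is the file-name mapping: getFileNameByUrl strips the leading "http://" and replaces characters that are illegal in file names with underscores. A hypothetical check (FileNameCheck is not part of the original code):

```java
public class FileNameCheck {
    public static void main(String[] args) {
        // constructor arguments only matter for crawling, not for the name mapping
        Spider spider = new Spider(new String[0], "http://localhost",
                "D:\\javaworkspace\\openzone", "utf-8");
        System.out.println(spider.getFileNameByUrl(
                "http://localhost/openzone/index.php?id=1",
                "text/html; charset=UTF-8"));
        // prints: localhost_openzone_index.php_id=1.html
    }
}
```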