Principle:
The Storage maintains three lists: unusedStorage, usingStorage, and removedStorage.
1. When the program starts, a seed URL is put into unusedStorage;
2. A worker takes a URL from unusedStorage;
3. The URL is removed from unusedStorage and placed into usingStorage;
4. Using that URL, the page source is fetched with HttpClient and saved to disk, and the URL is moved from usingStorage into removedStorage;
5. All hyperlinks (and other useful information) are extracted from the page;
6. The hyperlinks are put into unusedStorage; before adding, each one is checked against unusedStorage, usingStorage and removedStorage to avoid duplicates;
7. Go back to step 2. (A minimal sketch of this loop follows the list.)
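To tie the steps to the code that follows, here is a minimal sketch of one crawl iteration against the Storage API shown later in this post (assumed to be on the classpath). fetchPageAndExtractLinks() is a hypothetical stand-in for the HttpClient download and hyperlink extraction that GetterTool performs.

import java.util.Collections;
import java.util.List;

// Sketch of one crawl iteration against the Storage API shown later.
public class CrawlLoopSketch {

    public static void crawlOnce(Storage<String> storage) {
        String url = storage.consume();                      // steps 2-3: unusedStorage -> usingStorage
        List<String> links = fetchPageAndExtractLinks(url);  // steps 4-5: download page, save it, collect hrefs
        for (String link : links) {
            storage.produce(link);                           // step 6: added only if absent from all three lists
        }
        storage.remove(url);                                 // end of step 4: usingStorage -> removedStorage
    }

    // Hypothetical placeholder for the HttpClient download + link extraction done by GetterTool.
    private static List<String> fetchPageAndExtractLinks(String url) {
        return Collections.emptyList();
    }
}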
Shortcomings:
Pages compressed with gzip get no special handling, so Chinese text may be decoded as garbage; this needs improvement (see the gzip sketch below).
When saving a file, the file name is derived from the whole URL after some simple transformations, so file names can become too long (see the hashed-file-name sketch below).
The data is not persisted anywhere; Redis will be added later as the data store (see the Redis sketch below).
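For the gzip problem, here is a minimal sketch of one possible fix (not part of the current code): when the response carries Content-Encoding: gzip, decompress the body with the JDK's GZIPInputStream before decoding it with the page charset. The rawBytes, contentEncoding and charset parameters stand for the values GetterTool already extracts from the response and its headers.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.zip.GZIPInputStream;

// Decompress a gzip-encoded body before decoding it, so Chinese pages are not garbled.
public static String decodeBody(byte[] rawBytes, String contentEncoding, String charset) throws IOException {
    byte[] bytes = rawBytes;
    if ("gzip".equalsIgnoreCase(contentEncoding)) {
        try (GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream(rawBytes));
             ByteArrayOutputStream out = new ByteArrayOutputStream()) {
            byte[] buf = new byte[4096];
            int len;
            while ((len = gis.read(buf)) != -1) {
                out.write(buf, 0, len);
            }
            bytes = out.toByteArray();
        }
    }
    return charset == null ? new String(bytes) : new String(bytes, charset);
}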
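For the long-file-name problem, one common fix (again only a sketch; URLParseTool does not do this today) is to hash the URL and use the digest as the file name, so the name length no longer depends on the URL length. The ".html" suffix here is just an example.

import java.math.BigInteger;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

// Use an MD5 digest of the URL as the file name so names stay short.
public static String toFileName(String url) throws NoSuchAlgorithmException {
    MessageDigest md5 = MessageDigest.getInstance("MD5");
    byte[] digest = md5.digest(url.getBytes());
    return String.format("%032x", new BigInteger(1, digest)) + ".html";
}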
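For persistence, one plausible direction with the Jedis client (an assumption; nothing like this exists in the current code) is to keep the de-duplication set and the pending queue in Redis, so the crawl state survives a restart. The key names and the localhost connection are made up for the example.

import redis.clients.jedis.Jedis;

// Back the URL queue and de-duplication with Redis instead of in-memory lists.
public class RedisUrlStore {
    private final Jedis jedis = new Jedis("localhost", 6379); // assumed local Redis

    // Returns true if the URL was new and has been queued.
    public boolean offer(String url) {
        if (jedis.sismember("crawler:seen", url)) {
            return false;                    // already queued or processed
        }
        jedis.sadd("crawler:seen", url);     // remember it permanently
        jedis.rpush("crawler:unused", url);  // queue it for the crawler threads
        return true;
    }

    // Next URL to crawl, or null when the queue is empty.
    public String poll() {
        return jedis.lpop("crawler:unused");
    }
}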
// Program entry point
public class Launcher {

    public static void main(String[] args) throws InterruptedException {
        String seed = "http://cn.msn.com/";
        seed = "http://pp.163.com/square?projectnameforlofter=pp";
        //seed = "http://msn.qtmojo.com/main/adfshow?user=MSN|Home_Page|Homepage_Superbanner_1000X90&db=msn";
        //seed = "http://www.sina.com.cn/";
        //seed = "http://my.oschina.net/u/190049/blog/516634";

        // capacity 0 means the unused list is unbounded
        Storage<String> urlStorage = new Storage<>(0, seed, false);

        // int makersCounter = 0;
        int usersCounter = 60;
        ProxyInfo pi = new ProxyInfo("web-proxy.sgp.hp.com", 8080);
        for (int i = 0; i < usersCounter; i++) {
            Crawler pu = new Crawler(urlStorage, pi);
            Thread t = new Thread(pu, "Crawler_" + (i + 1));
            t.start();
        }
    }
}
// URL storage
package com.nwx.producer_consumer.web_crawler.storage;

import java.util.LinkedList;

public class Storage<T> {

    private LinkedList<T> unusedStorage;   // URLs waiting to be crawled
    private LinkedList<T> usingStorage;    // URLs currently being processed
    private LinkedList<T> removedStorage;  // URLs already processed
    private Integer capacity;
    private boolean isDebug;

    public Storage(LinkedList<T> storage, LinkedList<T> usedStorage, LinkedList<T> removedStorage,
            Integer capacity, boolean isDebug) {
        this.unusedStorage = storage;
        this.usingStorage = usedStorage;
        this.removedStorage = removedStorage;
        this.capacity = capacity;
        this.isDebug = isDebug;
    }

    public Storage() {
        this.unusedStorage = new LinkedList<>();
        this.usingStorage = new LinkedList<>();
        this.removedStorage = new LinkedList<>();
        this.capacity = 100;
    }

    public Storage(Integer capacity, T seed, boolean isDebug) {
        this.unusedStorage = new LinkedList<>();
        this.usingStorage = new LinkedList<>();
        this.removedStorage = new LinkedList<>();
        this.unusedStorage.add(seed);
        this.capacity = capacity;
        this.isDebug = isDebug;
    }

    /**
     * Mark an element as being in using mode.<br/>
     * The element cannot be consumed twice; once it has been handled
     * successfully, {@link #remove(Object)} marks it as removed.
     */
    public synchronized T consume() {
        T res = null;
        try {
            while (unusedStorage.size() <= 0) { // no elements available
                System.out.printf("[WARNING] The unused storage is empty now, %s was waiting \n",
                        Thread.currentThread().getName());
                wait();
            }
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        res = unusedStorage.remove();
        usingStorage.add(res);
        notifyAll();
        //ThreadUtils.waitAMoment(1000);
        if (isDebug) {
            System.out.printf(
                    "[SUCCESS] %s move [%s] to using storage, "
                            + "current sizes(unused,using,removed) are %d, %d, %d \n",
                    Thread.currentThread().getName(), res,
                    unusedStorage.size(), usingStorage.size(), removedStorage.size());
        }
        return res;
    }

    public synchronized boolean remove(T o) {
        try {
            while (usingStorage.size() <= 0) { // fixed: wait on usingStorage, not unusedStorage
                System.out.printf(
                        "[WARNING] The using storage is empty now, can not remove the element, %s was waiting \n",
                        Thread.currentThread().getName());
                wait();
            }
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        usingStorage.remove(o);
        removedStorage.add(o);
        if (isDebug) {
            System.out.printf(
                    "[SUCCESS] %s removed the element %s out of using storage, "
                            + "current sizes(unused,using,removed) are %d, %d, %d \n",
                    Thread.currentThread().getName(), o,
                    unusedStorage.size(), usingStorage.size(), removedStorage.size());
        }
        notifyAll();
        return true;
    }

    public synchronized void produce(T o) {
        try {
            if (capacity > 0) { // capacity <= 0 means unbounded
                while (unusedStorage.size() >= capacity) {
                    System.out.printf("[WARNING] The storage is full now, %s was waiting \n",
                            Thread.currentThread().getName());
                    wait();
                }
            }
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        // de-duplicate against all three lists before queueing
        if (unusedStorage.contains(o)) {
            if (isDebug) {
                System.out.printf("[DUPLICATED] The element %s was contained in the unused storage \n", o);
            }
        } else if (usingStorage.contains(o)) {
            if (isDebug) {
                System.out.printf("[DUPLICATED] The element %s was contained in the using storage \n", o);
            }
        } else if (removedStorage.contains(o)) {
            if (isDebug) {
                System.out.printf("[DUPLICATED] The element %s was contained in the removed storage \n", o);
            }
        } else {
            unusedStorage.add(o);
            if (isDebug) {
                System.out.printf(
                        "[SUCCESS] %s put [%s] into unused storage, "
                                + "current sizes(unused,using,removed) are %d, %d, %d \n",
                        Thread.currentThread().getName(), o,
                        unusedStorage.size(), usingStorage.size(), removedStorage.size());
            }
            notifyAll();
        }
    }

    @Override
    public String toString() {
        return String.format("Storage info: \nCapacity: %d CurrentSize: %d ", capacity, unusedStorage.size());
    }
}
// Crawler thread
package com.nwx.producer_consumer.web_crawler.makerusers;

import java.io.IOException;
import java.util.List;

import org.apache.commons.lang3.StringUtils;
import org.apache.http.client.ClientProtocolException;

import com.nwx.producer_consumer.web_crawler.storage.Storage;
import com.nwx.producer_consumer.web_crawler.tools.GetterTool;
import com.nwx.study.http.ProxyInfo;

public class Crawler implements Runnable {

    private Storage<String> pageUrlstorage;
    //private Storage<String> sourceUrlstorage;
    private ProxyInfo pi;

    public Crawler(Storage<String> pageUrlstorage, ProxyInfo pi) {
        this.pageUrlstorage = pageUrlstorage;
        this.pi = pi;
    }

    private void getAndHandleElement() throws ClientProtocolException, IOException {
        String targetUrl = pageUrlstorage.consume();
        // System.out.printf("Get %s out of storage \n", targetUrl);
        GetterTool gt = new GetterTool(pi);
        List<String> hrefs = gt.getHyperlinkUrlsFromPageUrl(targetUrl);
        // System.out.println("###### Get hrefs size is: " + hrefs.size());
        if (null != hrefs) {
            for (String href : hrefs) {
                if (StringUtils.isNotEmpty(href) && href.startsWith("http")) {
                    pageUrlstorage.produce(href);
                }
            }
        }
        // mark the URL as processed even when no links were extracted,
        // otherwise it would stay in usingStorage forever
        pageUrlstorage.remove(targetUrl);
    }

    @Override
    public void run() {
        while (true) {
            try {
                getAndHandleElement();
            } catch (Exception e) {
                continue;
            }
        }
    }
}
// URL download tool
package com.nwx.producer_consumer.web_crawler.tools;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang3.StringUtils;
import org.apache.http.client.ClientProtocolException;

import com.nwx.study.getter.GetterStatus;
import com.nwx.study.http.ProxyInfo;
import com.nwx.study.http.TagUtils;
import com.nwx.study.http.URLParseTool;
import com.nwx.study.io.ReadWriteWebFile;
import com.nwx.study.io.http.ResponseAndHeads;

public class GetterTool {

    private ProxyInfo pi;

    public ProxyInfo getPi() {
        return pi;
    }

    public void setPi(ProxyInfo pi) {
        this.pi = pi;
    }

    public GetterTool(ProxyInfo pi) {
        this.pi = pi;
    }

    public List<String> getHyperlinkUrlsFromPageUrl(String targetUrl) throws ClientProtocolException, IOException {
        if (StringUtils.isNotEmpty(targetUrl) && targetUrl.startsWith("http")) {
            ResponseAndHeads doc = ReadWriteWebFile.getWebResponseWithEntityUtils(targetUrl, pi);
            if (null != doc) {
                String contentTypeStr = doc.getHeaders().get("Content-Type");
                // TODO: handle Content-Encoding: gzip here, otherwise the decoded text may be garbled
                String contentEncoding = doc.getHeaders().get("Content-Encoding"); // e.g. gzip
                System.out.printf("The contentType is %s \n", contentTypeStr);
                if (StringUtils.isNotEmpty(contentTypeStr) && contentTypeStr.contains("text/html")) {
                    // HTML text page
                    String fileContent = "";
                    if (contentTypeStr.contains("charset")) {
                        String charset = contentTypeStr.substring(contentTypeStr.indexOf("=") + 1);
                        fileContent = new String(doc.getResponseContent(), charset);
                    } else {
                        fileContent = new String(doc.getResponseContent());
                    }
                    String fullFileName = URLParseTool.getPageUrlFullPath(targetUrl);
                    System.out.printf("### The filename is %s \n", fullFileName);
                    ReadWriteWebFile.writeFile(fileContent, GetterStatus.CONTENT_STORE_PATH + "/" + fullFileName);

                    List<String> tags = TagUtils.getTagsByType(fileContent, "a");
                    List<String> hrefs = TagUtils.getAttr(tags, "href");
                    List<String> imgTags = TagUtils.getTagsByType(fileContent, "img");
                    List<String> imgUrls = TagUtils.getAttr(imgTags, "src");

                    List<String> res = new ArrayList<>();
                    res.addAll(imgUrls);
                    res.addAll(hrefs);
                    // Both hyperlinks and image URLs are returned, so the image branch
                    // below can download the pictures when those URLs are crawled later.
                    return res;
                } else if (StringUtils.isNotEmpty(contentTypeStr) && contentTypeStr.contains("image/")) {
                    // binary file such as an image, e.g. Content-Type: image/jpeg
                    String fileType = "image/";
                    if (contentTypeStr.startsWith(fileType)) {
                        if (contentTypeStr.contains(" ")) {
                            contentTypeStr = contentTypeStr.substring(0, contentTypeStr.indexOf(" "));
                        } else if (contentTypeStr.contains(";")) {
                            contentTypeStr = contentTypeStr.substring(0, contentTypeStr.indexOf(";"));
                        }
                        String suffix = contentTypeStr.substring(contentTypeStr.indexOf(fileType) + fileType.length());
                        byte[] fileContent = doc.getResponseContent();
                        String fullFileName = URLParseTool.getSourceUrlFullPath(targetUrl, suffix);
                        ReadWriteWebFile.writeFile(fileContent, GetterStatus.IMAGE_STORE_PATH + "/" + fullFileName);
                    }
                }
            }
        }
        return null;
    }

    public Map<String, List<String>> getUrlsAndSourcesFromPageUrl(String pageUrl)
            throws ClientProtocolException, IOException {
        Map<String, List<String>> res = new HashMap<String, List<String>>();
        if (StringUtils.isNotEmpty(pageUrl) && pageUrl.startsWith("http")) {
            ResponseAndHeads resp = ReadWriteWebFile.getWebResponseWithEntityUtils(pageUrl, pi);
            if (null != resp) {
                String contentTypeStr = resp.getHeaders().get("Content-Type");
                if (StringUtils.isNotEmpty(contentTypeStr) && contentTypeStr.contains("text/html")) {
                    // HTML text page
                    String fileContent = "";
                    if (contentTypeStr.contains("charset")) {
                        String charset = contentTypeStr.substring(contentTypeStr.indexOf("=") + 1);
                        fileContent = new String(resp.getResponseContent(), charset);
                    } else {
                        fileContent = new String(resp.getResponseContent());
                    }
                    String fullFileName = URLParseTool.getPageUrlFullPath(pageUrl);
                    ReadWriteWebFile.writeFile(fileContent, GetterStatus.CONTENT_STORE_PATH + "/" + fullFileName);

                    List<String> hyperlinkTags = TagUtils.getTagsByType(fileContent, "a");
                    List<String> hrefs = TagUtils.getAttr(hyperlinkTags, "href");
                    res.put("hrefs", hrefs);

                    // <img src="http://img0.tuicool.com/AFV73a.png!web" class="alignCenter">
                    List<String> imagesTags = TagUtils.getTagsByType(fileContent, "img");
                    List<String> images = TagUtils.getAttr(imagesTags, "src");
                    res.put("images", images);
                }
            }
        }
        return res; // fixed: the collected map was previously discarded by returning null
    }

    public void getSourceFile(String sourceUrl) throws ClientProtocolException, IOException {
        if (StringUtils.isNotEmpty(sourceUrl) && sourceUrl.startsWith("http")) {
            ResponseAndHeads resp = ReadWriteWebFile.getWebResponseWithEntityUtils(sourceUrl, pi);
            if (null != resp) {
                String fileType = "image/";
                String contentTypeStr = resp.getHeaders().get("Content-Type");
                if (StringUtils.isNotEmpty(contentTypeStr) && contentTypeStr.startsWith(fileType)) {
                    if (contentTypeStr.contains(" ")) {
                        contentTypeStr = contentTypeStr.substring(0, contentTypeStr.indexOf(" "));
                    }
                    String suffix = contentTypeStr.substring(contentTypeStr.indexOf(fileType) + fileType.length());
                    // mirror the image branch above so the fetched binary is actually saved
                    byte[] fileContent = resp.getResponseContent();
                    String fullFileName = URLParseTool.getSourceUrlFullPath(sourceUrl, suffix);
                    ReadWriteWebFile.writeFile(fileContent, GetterStatus.IMAGE_STORE_PATH + "/" + fullFileName);
                }
            }
        }
    }
}