# 地址
https://github.com/mofadeyunduo/crawlernode
# 前言
一、代碼不斷優化更新。
二、有建議請留言。
# 介紹
一、多線程,基於 ExecutorService。
二、使用 Socket 進行 HTTP 請求。
# 優化想法
一、線程複用,不爲每個網頁單獨創建一個線程,每一個 Crawler 負責多個網頁的爬取。
二、多個網頁進行一次讀寫,減小 IO 時間(待實現)。
三、多代理,防止請求過多,服務器拒絕響應(待實現)。
# 代碼
SocketCrawler.java:負責爬取網頁。
package per.piers.crawler.service; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.w3c.dom.Document; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; import per.piers.crawler.model.HTTPStatus; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import java.io.*; import java.net.Socket; import java.util.*; import java.util.concurrent.ExecutorService; import java.util.concurrent.TimeUnit; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Created by Piers on 2017/4/15. */ public class SocketCrawler implements Runnable { private Task task; private static Logger logger = LogManager.getLogger(SocketCrawler.class.getName()); private Map<String, String> headers = new LinkedHashMap<>(); private LinkedList<String> websites; private String charset = "utf-8"; private ExecutorService executorService; private String outputPath; public SocketCrawler(LinkedList<String> websites, String outputPath, ExecutorService executorService, Task task) { this(websites, outputPath, null, null, executorService, task); } public SocketCrawler(LinkedList<String> websites, String outputPath, String charset, ExecutorService executorService, Task task) { this(websites, outputPath, charset, null, executorService, task); } public SocketCrawler(LinkedList<String> websites, String outputPath, String charset, Map<String, String> headers, ExecutorService executorService, Task task) { if (websites != null) { this.websites = websites; } else { throw new NullPointerException("websites is null"); } if (executorService != null) { this.executorService = executorService; } else { throw new NullPointerException("executorService is null"); } if (outputPath != null) { this.outputPath = outputPath; new File(outputPath).mkdirs(); } else { throw new NullPointerException("outputPath is null"); } if (task != null) { 
this.task = task; } else { throw new NullPointerException("task is null"); } if (charset != null) this.charset = charset; logger.debug("Charset: {}", this.charset); if (headers != null) this.headers.putAll(headers); try { DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance(); DocumentBuilder documentBuilder = documentBuilderFactory.newDocumentBuilder(); Document document = documentBuilder.parse(new File("target/classes/defaultHeaders.xml")); NodeList nodeList = document.getElementsByTagName("header"); for (int i = 0; i < nodeList.getLength(); i++) { NamedNodeMap map = nodeList.item(i).getAttributes(); this.headers.put(map.getNamedItem("key").getNodeValue(), map.getNamedItem("value").getNodeValue()); } } catch (ParserConfigurationException | IOException | SAXException e) { e.printStackTrace(); } } public String crawl(String website) throws IOException { synchronized (task) { task.addCount(); logger.info("Count: {}", task.getCount()); } logger.traceEntry(); logger.info("Crawling: {}", website); String[] resolves = resolveWebsite(website); String host = resolves[0], request = resolves[1]; Socket socket = new Socket(host, 80); setOutputStream(socket.getOutputStream(), host, request); try { try (BufferedReader reader = new BufferedReader(new InputStreamReader(socket.getInputStream(), charset))) { String firstLine = reader.readLine(); HTTPStatus status = getStatus(firstLine); if (status == null) { String error = String.format("Unknown HTTP status: %s", website); logger.error(error); throw new IllegalStateException(error); } switch (status) { case NOT_FOUND: logger.warn("404: {}", website); return null; } String line = null; while ((line = reader.readLine()) != null && !line.equals("")) ; StringBuilder builder = new StringBuilder(); while ((line = reader.readLine()) != null) { builder.append(line + "\n"); } logger.info("Crawled: {}", website); return builder.toString(); } } finally { socket.close(); logger.traceExit(); } } private String[] 
resolveWebsite(String website) { Pattern pattern = Pattern.compile("http://(?<domain>[\\w.]+)(?<request>/.*)?", Pattern.CASE_INSENSITIVE); Matcher matcher = pattern.matcher(website); if (!matcher.find()) { String error = String.format("Probably %s is not a valid website", website); logger.error(error); throw new InputMismatchException(error); } String host = matcher.group("domain"); String request = matcher.group("request"); if (request == null) request = "/"; logger.debug("Domain is {}", host); logger.debug("Request is {}", request); return new String[]{host, request}; } private void setOutputStream(OutputStream outputStream, String host, String request) throws IOException { BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(outputStream, charset)); String firstLine = String.format("GET %s HTTP/1.1", request); logger.debug("HTTP request: {}", firstLine); writer.write(firstLine); writer.newLine(); String hostLine = String.format("Host: %s", host); logger.debug("HTTP request: {}", hostLine); writer.write(hostLine); writer.newLine(); for (String key : headers.keySet()) { String entity = String.format("%s:%s", key, headers.get(key)); logger.debug("HTTP request: {}", entity); writer.write(entity); writer.newLine(); } writer.newLine(); writer.flush(); } private HTTPStatus getStatus(String firstLine) { Matcher matcher = Pattern.compile("HTTP/\\d.\\d (?<HTTPStatus>\\d{3}) \\w+").matcher(firstLine); if (matcher.find()) { switch (Integer.parseInt(matcher.group("HTTPStatus"))) { case 200: return HTTPStatus.OK; case 404: return HTTPStatus.NOT_FOUND; } } return null; } @Override public void run() { // TODO: replace with handler for (String website : websites) { if (!executorService.isShutdown()) { try { String result = crawl(website); if (result != null) { File file = new File(outputPath + "/" + website.replace("http://", "").replaceAll("[/.]", "_")); logger.info("Writing data to {}", file.getAbsolutePath()); if (!file.exists()) file.createNewFile(); try 
(BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file)))) { writer.write(result); writer.flush(); } logger.info("Has write {}", file.getAbsolutePath()); } TimeUnit.SECONDS.sleep(new Random().nextInt(task.getTHREAD_SIZE() * 2)); synchronized (task) { if (task.getCount() == task.getTASK_SIZE()) { executorService.shutdown(); } } } catch (IOException e) { logger.error(e.getMessage()); e.printStackTrace(); } catch (InterruptedException e) { // e.printStackTrace(); } } } } }
log4j2.xml
<?xml version="1.0" encoding="UTF-8"?>
<!-- The "status" attribute controls log4j2's own internal diagnostics; it may be omitted.
     Setting it to "trace" prints detailed log4j2-internal output. -->
<!-- monitorInterval: log4j2 re-reads this file and reconfigures itself automatically;
     the value is the check interval in seconds. -->
<configuration status="error" monitorInterval="30">
    <!-- Define all appenders first. -->
    <appenders>
        <!-- Console output. -->
        <Console name="Console" target="SYSTEM_OUT">
            <!-- Only messages at this level or above are printed (onMatch); everything else is dropped (onMismatch). -->
            <ThresholdFilter level="debug" onMatch="ACCEPT" onMismatch="DENY"/>
            <!-- Layout pattern for each log line. -->
            <PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss} [%-5level] %class %t %M - %msg%n"/>
        </Console>
        <!-- This file receives everything and is truncated on every run (append="false");
             handy for ad-hoc testing. -->
        <File name="log" fileName="log/test.log" append="false">
            <PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss} [%-5level] %class %t %M - %msg%n"/>
        </File>
        <!-- Rolling file: whenever the size threshold is exceeded, the log is archived and
             compressed into a year-month folder. -->
        <RollingFile name="RollingFile" fileName="logs/app.log" filePattern="log/%d{yyyy-MM}/app-%d{MM-dd-yyyy}-%i.log.gz">
            <PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss} [%-5level] %class %t %M - %msg%n"/>
            <SizeBasedTriggeringPolicy size="50MB"/>
            <!-- DefaultRolloverStrategy defaults to at most 7 archive files per folder; raised to 20 here. -->
            <DefaultRolloverStrategy max="20"/>
        </RollingFile>
    </appenders>
    <!-- Appenders only take effect when referenced by a logger. -->
    <loggers>
        <!-- Default root logger. -->
        <root level="trace">
            <appender-ref ref="RollingFile"/>
            <appender-ref ref="Console"/>
        </root>
    </loggers>
</configuration>
defaultHeaders.xml
<?xml version="1.0" encoding="utf-8"?>
<!-- Default HTTP request headers merged into every request sent by SocketCrawler
     (loaded from target/classes/defaultHeaders.xml in its constructor). -->
<headers>
    <header key="User-Agent" value="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"></header>
</headers>
# 遇到的 bug
## 問題
返回的網頁亂碼,設定 UTF-8 無用。
## 解決
一開始在 Header 裏設置了 Accept-Encoding 屬性。
<header key="Accept-Encoding" value="gzip, deflate, sdch, br"></header>
致使返回的是通過編碼的網頁。刪去便可。
## 遇到的問題
HTTP 請求時,服務器不返回數據。
## 解決
在 HTTP 請求的輸出流 outputStream 最後輸出 "\r\n",標明請求已經發送完畢。