Java Socket 爬蟲

# 地址

 

https://github.com/mofadeyunduo/crawlernode

 

# 前言

 

一、代碼不斷優化更新。

二、有建議請留言。

 

# 介紹

 

一、多線程,基於 ExecutorService。

二、使用 Socket 進行 HTTP 請求。

 

# 優化想法

 

一、線程複用,不爲每個網頁單首創建一個線程,每一個 Crawler 負責多個網頁的爬取。

二、多個網頁進行一次讀寫,減小 IO 時間(待實現)。

三、多代理,防止請求過多,服務器拒絕響應(待實現)。

 

# 代碼

 

SocketCrawler.java:負責爬取網頁。

package per.piers.crawler.service;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import per.piers.crawler.model.HTTPStatus;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import java.io.*;
import java.net.Socket;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Created by Piers on 2017/4/15.
 */
/**
 * Created by Piers on 2017/4/15.
 *
 * Crawls a fixed list of websites over raw TCP sockets (hand-written HTTP/1.1
 * GET requests) and writes each successfully fetched page body to a file under
 * {@code outputPath}. An instance is a {@link Runnable} meant to be submitted
 * to the shared {@link ExecutorService}; the shared {@link Task} tracks global
 * progress and triggers executor shutdown when all pages are done.
 */
public class SocketCrawler implements Runnable {

    private static final Logger logger = LogManager.getLogger(SocketCrawler.class.getName());
    // Compile patterns once — Pattern is immutable and thread-safe.
    private static final Pattern WEBSITE_PATTERN =
            Pattern.compile("http://(?<domain>[\\w.]+)(?<request>/.*)?", Pattern.CASE_INSENSITIVE);
    // Bug fix: the dot between the version digits was unescaped ("\\d.\\d") and matched any character.
    private static final Pattern STATUS_PATTERN =
            Pattern.compile("HTTP/\\d\\.\\d (?<HTTPStatus>\\d{3}) \\w+");
    // java.util.Random is thread-safe; one shared instance instead of one per page.
    private static final Random RANDOM = new Random();

    private Task task;
    private Map<String, String> headers = new LinkedHashMap<>();
    private LinkedList<String> websites;
    // Used both to decode HTTP responses and to encode the output files.
    private String charset = "utf-8";
    private ExecutorService executorService;
    private String outputPath;

    public SocketCrawler(LinkedList<String> websites, String outputPath, ExecutorService executorService, Task task) {
        this(websites, outputPath, null, null, executorService, task);
    }

    public SocketCrawler(LinkedList<String> websites, String outputPath, String charset, ExecutorService executorService, Task task) {
        this(websites, outputPath, charset, null, executorService, task);
    }

    /**
     * @param websites        URLs to crawl; must be non-null
     * @param outputPath      directory crawled pages are written into (created if absent)
     * @param charset         charset for decoding responses and encoding output files; defaults to utf-8
     * @param headers         extra HTTP request headers; these override the defaults from defaultHeaders.xml
     * @param executorService shared executor, shut down once the whole task is finished
     * @param task            shared progress counter
     * @throws NullPointerException if websites, outputPath, executorService or task is null
     */
    public SocketCrawler(LinkedList<String> websites, String outputPath, String charset, Map<String, String> headers, ExecutorService executorService, Task task) {
        this.websites = Objects.requireNonNull(websites, "websites is null");
        this.executorService = Objects.requireNonNull(executorService, "executorService is null");
        this.outputPath = Objects.requireNonNull(outputPath, "outputPath is null");
        new File(outputPath).mkdirs();
        this.task = Objects.requireNonNull(task, "task is null");
        if (charset != null) this.charset = charset;
        logger.debug("Charset: {}", this.charset);
        try {
            DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance();
            DocumentBuilder documentBuilder = documentBuilderFactory.newDocumentBuilder();
            // NOTE(review): hardcoded build-output path; consider loading from the classpath instead.
            Document document = documentBuilder.parse(new File("target/classes/defaultHeaders.xml"));
            NodeList nodeList = document.getElementsByTagName("header");
            for (int i = 0; i < nodeList.getLength(); i++) {
                NamedNodeMap map = nodeList.item(i).getAttributes();
                this.headers.put(map.getNamedItem("key").getNodeValue(), map.getNamedItem("value").getNodeValue());
            }
        } catch (ParserConfigurationException | IOException | SAXException e) {
            // Best-effort: missing/broken default headers are logged, not fatal.
            logger.error("Cannot load default headers", e);
        }
        // Bug fix: defaults are loaded first and caller-supplied headers applied last,
        // so user headers are no longer silently overwritten by defaultHeaders.xml.
        if (headers != null) this.headers.putAll(headers);
    }

    /**
     * Fetches one page and returns its decoded body.
     *
     * @param website full URL, e.g. {@code http://example.com/path}
     * @return the response body, or {@code null} when the server answers 404
     * @throws IOException            on network failure
     * @throws IllegalStateException  when the HTTP status line cannot be parsed
     * @throws InputMismatchException when the URL does not look like http://host[/path]
     */
    public String crawl(String website) throws IOException {
        synchronized (task) {
            task.addCount();
            logger.info("Count: {}", task.getCount());
        }
        logger.traceEntry();
        logger.info("Crawling: {}", website);
        String[] resolves = resolveWebsite(website);
        String host = resolves[0], request = resolves[1];
        // try-with-resources closes both reader and socket even when an exception escapes.
        try (Socket socket = new Socket(host, 80)) {
            setOutputStream(socket.getOutputStream(), host, request);
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(socket.getInputStream(), charset))) {
                String firstLine = reader.readLine();
                HTTPStatus status = getStatus(firstLine);
                if (status == null) {
                    String error = String.format("Unknown HTTP status: %s", website);
                    logger.error(error);
                    throw new IllegalStateException(error);
                }
                switch (status) {
                    case NOT_FOUND:
                        logger.warn("404: {}", website);
                        return null;
                }
                // Skip the response headers (terminated by an empty line).
                String line;
                while ((line = reader.readLine()) != null && !line.equals("")) ;
                // Read the body until EOF; terminates because the request sends
                // "Connection: close". TODO: chunked Transfer-Encoding is not handled.
                StringBuilder builder = new StringBuilder();
                while ((line = reader.readLine()) != null) {
                    builder.append(line).append('\n');
                }
                logger.info("Crawled: {}", website);
                return builder.toString();
            }
        } finally {
            logger.traceExit();
        }
    }

    /**
     * Splits an {@code http://} URL into {@code {host, requestPath}}.
     *
     * @throws InputMismatchException when the URL does not match http://host[/path]
     */
    private String[] resolveWebsite(String website) {
        Matcher matcher = WEBSITE_PATTERN.matcher(website);
        if (!matcher.find()) {
            String error = String.format("Probably %s is not a valid website", website);
            logger.error(error);
            throw new InputMismatchException(error);
        }
        String host = matcher.group("domain");
        String request = matcher.group("request");
        if (request == null) request = "/"; // bare "http://host" means GET /
        logger.debug("Domain is {}", host);
        logger.debug("Request is {}", request);
        return new String[]{host, request};
    }

    /**
     * Writes a minimal HTTP/1.1 GET request to the socket's output stream.
     * The writer is deliberately not closed: closing it would close the socket
     * before the response has been read.
     */
    private void setOutputStream(OutputStream outputStream, String host, String request) throws IOException {
        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(outputStream, charset));
        String firstLine = String.format("GET %s HTTP/1.1", request);
        logger.debug("HTTP request: {}", firstLine);
        writer.write(firstLine);
        writer.newLine();
        String hostLine = String.format("Host: %s", host);
        logger.debug("HTTP request: {}", hostLine);
        writer.write(hostLine);
        writer.newLine();
        // Bug fix: without "Connection: close" an HTTP/1.1 server keeps the
        // connection alive and the read-until-EOF loop in crawl() blocks until
        // the server times out.
        if (!headers.containsKey("Connection")) {
            writer.write("Connection: close");
            writer.newLine();
        }
        for (String key : headers.keySet()) {
            String entity = String.format("%s:%s", key, headers.get(key));
            logger.debug("HTTP request: {}", entity);
            writer.write(entity);
            writer.newLine();
        }
        writer.newLine(); // blank line terminates the request head
        writer.flush();
    }

    /**
     * Parses the HTTP status line. Returns {@code null} for unsupported statuses
     * or when the server closed the connection without sending anything.
     */
    private HTTPStatus getStatus(String firstLine) {
        if (firstLine == null) return null; // bug fix: EOF before any response caused an NPE
        Matcher matcher = STATUS_PATTERN.matcher(firstLine);
        if (matcher.find()) {
            switch (Integer.parseInt(matcher.group("HTTPStatus"))) {
                case 200:
                    return HTTPStatus.OK;
                case 404:
                    return HTTPStatus.NOT_FOUND;
            }
        }
        return null;
    }

    @Override
    public void run() {
        // TODO: replace with handler
        for (String website : websites) {
            if (executorService.isShutdown()) break; // whole task already finished
            try {
                String result = crawl(website);
                if (result != null) {
                    File file = new File(outputPath + "/" + website.replace("http://", "").replaceAll("[/.]", "_"));
                    logger.info("Writing data to {}", file.getAbsolutePath());
                    // Bug fix: encode the file with the same charset used to decode the
                    // response; the platform default produced mojibake for non-ASCII pages.
                    // (FileOutputStream creates the file itself; createNewFile was redundant.)
                    try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), charset))) {
                        writer.write(result);
                    }
                    logger.info("Has write {}", file.getAbsolutePath());
                }
                // Polite randomized delay between requests.
                TimeUnit.SECONDS.sleep(RANDOM.nextInt(task.getTHREAD_SIZE() * 2));
                synchronized (task) {
                    if (task.getCount() == task.getTASK_SIZE()) {
                        executorService.shutdown();
                    }
                }
            } catch (IOException e) {
                // Keep the cause instead of printStackTrace(); continue with the next URL.
                logger.error("Crawl failed: {}", website, e);
            } catch (InterruptedException e) {
                // Bug fix: restore the interrupt flag and stop instead of swallowing it.
                Thread.currentThread().interrupt();
                break;
            }
        }
    }

}

 

log4j2.xml

<?xml version="1.0" encoding="UTF-8"?>
<!--The "status" attribute controls log4j2's own internal logging; it is optional. Set it to "trace" to see detailed log4j2 internal output.-->
<!--monitorInterval: interval in seconds at which Log4j checks the configuration file for changes and reconfigures itself.-->
<configuration status="error" monitorInterval="30">
    <!--First define all appenders.-->
    <appenders>
        <!--Console appender configuration.-->
        <Console name="Console" target="SYSTEM_OUT">
            <!--The console only prints messages at this level and above (onMatch); everything else is rejected (onMismatch).-->
            <ThresholdFilter level="debug" onMatch="ACCEPT" onMismatch="DENY"/>
            <!--The log output pattern.-->
            <PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss} [%-5level] %class %t %M - %msg%n"/>
        </Console>
        <!--This file receives all messages and is cleared on every run (append="false") - handy for temporary testing.-->
        <File name="log" fileName="log/test.log" append="false">
            <PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss} [%-5level] %class %t %M - %msg%n"/>
        </File>
        <!--Receives all messages; whenever the file exceeds "size", that chunk is compressed and archived into a year-month folder.-->
        <RollingFile name="RollingFile" fileName="logs/app.log"
                     filePattern="log/%d{yyyy-MM}/app-%d{MM-dd-yyyy}-%i.log.gz">
            <PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss} [%-5level] %class %t %M - %msg%n"/>
            <SizeBasedTriggeringPolicy size="50MB"/>
            <!--DefaultRolloverStrategy defaults to at most 7 archive files per folder; raised to 20 here.-->
            <DefaultRolloverStrategy max="20"/>
        </RollingFile>
    </appenders>
    <!--Then define the loggers; an appender only takes effect once a logger references it.-->
    <loggers>
        <!--Set up a default root logger.-->
        <root level="trace">
            <appender-ref ref="RollingFile"/>
            <appender-ref ref="Console"/>
        </root>
    </loggers>
</configuration>

 

defaultHeaders.xml

<?xml version="1.0" encoding="utf-8"?>
<!-- Default HTTP request headers loaded by SocketCrawler at construction time;
     each <header> element becomes a "key: value" line in the outgoing request. -->
<headers>
    <header key="User-Agent" value="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"></header>
</headers>

 

# 遇到的 bug

 

## 問題

 

返回的網頁亂碼,設定 UTF-8 無用。

 

## 解決

 

一開始在 Header 裏設置了 Accept-Encoding 屬性。

<header key="Accept-Encoding" value="gzip, deflate, sdch, br"></header>

致使返回的是通過編碼的網頁。刪去便可。

 

## 遇到的問題

 

HTTP 請求時,服務器不返回數據。

 

## 解決

 

在 HTTP 請求的輸出流 outputStream 最後輸出"\r\n",標明請求已經發送完畢。

相關文章
相關標籤/搜索