網絡爬蟲速成指南(一)網頁下載

 
 
註解:此處僅介紹一些類庫及其常規用法;若要詳細瞭解 HTTP 協議,推薦閱讀《HTTP權威指南》。
 
 

.NET 方向:主要是用到 HttpWebRequest 下載內容。

JAVA方向:
主要是用到HttpClient下載內容
示例代碼:
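相關類庫(httpclient-4.1.2 httpcore-4.1.4)。注意:下方示例代碼使用的 CloseableHttpClient、PoolingHttpClientConnectionManager、HttpClientContext 等 API 需要 httpclient 4.3 及以上版本。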
示例代碼(Java):
package com.data.crawl.qa.baiduzhidao;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.cookie.CookieSpecProvider;
import org.apache.http.impl.cookie.BestMatchSpecFactory;
import org.apache.http.impl.cookie.BrowserCompatSpecFactory;

/**
 * Downloads web pages through Apache HttpClient instances that share one pooled
 * connection manager.
 *
 * <p>{@link #Post(String, Map)} executes with the shared {@link #context};
 * {@link #downHtml(String)} executes without it.
 *
 * @author wqj
 */
public class HttpClientPool {

    private static final Log log = LogFactory.getLog(HttpClientPool.class);

    /** Maximum number of connections the pool may hold in total. */
    private static final int MAX_TOTAL_CONNECTIONS = 10;

    /** Maximum number of pooled connections per route. */
    private static final int MAX_CONNECTIONS_PER_ROUTE = 10;

    /** Socket (read) timeout in milliseconds. */
    private static final int SOCKET_TIMEOUT_MILLIS = 120000;

    /** Connect timeout in milliseconds. */
    private static final int CONNECT_TIMEOUT_MILLIS = 60000;

    /** Charset used when neither the response header nor detection yields a usable one. */
    private static final String DEFAULT_CHARSET = "utf-8";

    /** Size of the buffer used while draining a response body. */
    private static final int BUFFER_SIZE = 1024;

    /**
     * Extracts the charset parameter of a Content-Type value. The capture stops at
     * whitespace, ';' or a quote so trailing parameters and quoting are not mistaken
     * for part of the charset name.
     */
    private static final Pattern CHARSET_PATTERN =
            Pattern.compile("charset\\s*=\\s*[\"']?([^\\s;\"']+)", Pattern.CASE_INSENSITIVE);

    /** Connection pool shared by every client this instance builds. */
    private final PoolingHttpClientConnectionManager connectionManager;

    /** Execution context (carrying the cookie spec registry) used by {@link #Post(String, Map)}. */
    protected HttpClientContext context = null;

    /**
     * Creates the connection pool and the request context with its cookie spec registry.
     */
    public HttpClientPool() {
        connectionManager = new PoolingHttpClientConnectionManager();
        connectionManager.setMaxTotal(MAX_TOTAL_CONNECTIONS);
        connectionManager.setDefaultMaxPerRoute(MAX_CONNECTIONS_PER_ROUTE);

        context = HttpClientContext.create();
        Registry<CookieSpecProvider> registry = RegistryBuilder.<CookieSpecProvider> create()
                .register(CookieSpecs.BEST_MATCH, new BestMatchSpecFactory())
                .register(CookieSpecs.BROWSER_COMPATIBILITY, new BrowserCompatSpecFactory())
                .build();
        context.setCookieSpecRegistry(registry);
    }

    /**
     * Builds an HttpClient backed by the shared connection pool.
     *
     * <p>A new client is built for every call on purpose: {@link #downHtml(String)}
     * executes without the shared context and relies on not inheriting state from
     * earlier requests. The returned client is not closed by callers;
     * NOTE(review): closing it appears to shut down the shared connection manager
     * as well - confirm against the HttpClient version in use.
     *
     * @return a client configured with this class's timeouts and cookie spec
     */
    private CloseableHttpClient getHttpClient() {
        RequestConfig config = RequestConfig.custom()
                .setSocketTimeout(SOCKET_TIMEOUT_MILLIS)
                .setConnectTimeout(CONNECT_TIMEOUT_MILLIS)
                .setCookieSpec(CookieSpecs.BEST_MATCH)
                .build();
        return HttpClients.custom()
                .setDefaultRequestConfig(config)
                .setConnectionManager(connectionManager)
                .build();
    }

    /**
     * Sends a form-encoded POST request using the shared {@link #context} and returns
     * the response body as text.
     *
     * @param uri    target URI
     * @param params form fields; {@code null} is treated as an empty form
     * @return the decoded response body, or {@code null} if the response has no entity
     *         or the request fails
     */
    public String Post(String uri, Map<String, String> params) {
        HttpPost httpost = new HttpPost(uri);
        List<NameValuePair> formFields = new ArrayList<NameValuePair>();
        if (params != null) {
            for (Map.Entry<String, String> field : params.entrySet()) {
                formFields.add(new BasicNameValuePair(field.getKey(), field.getValue()));
            }
        }

        CloseableHttpResponse response = null;
        try {
            httpost.setEntity(new UrlEncodedFormEntity(formFields, "UTF-8"));
            response = getHttpClient().execute(httpost, context);
            String html = readBody(response.getEntity(), DEFAULT_CHARSET);
            if (log.isDebugEnabled()) {
                log.debug(html);
            }
            return html;
        } catch (IOException e) {
            // Covers UnsupportedEncodingException and ClientProtocolException as well.
            log.error("POST " + uri + " failed: " + e.getMessage(), e);
        } finally {
            // Always release the pooled connection, including on failure paths.
            closeQuietly(response);
        }
        return null;
    }

    /**
     * Downloads a page with a GET request. The request is executed without the shared
     * {@link #context}; intended for the first visit of a simulated login.
     *
     * @param uri the URI to fetch
     * @return the decoded HTML document, or {@code null} if the status is not 200, the
     *         response has no entity, or the request fails
     */
    public String downHtml(String uri) {
        HttpGet httpget = new HttpGet(uri);
        CloseableHttpResponse response = null;
        try {
            response = getHttpClient().execute(httpget);

            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode != HttpStatus.SC_OK) {
                log.warn("request failed: " + response.getStatusLine());
                return null;
            }

            String headerCharset = charsetFromContentType(response.getFirstHeader("Content-Type"));
            return readBody(response.getEntity(), headerCharset);
        } catch (IOException e) {
            // Covers ClientProtocolException as well.
            log.error("GET " + uri + " failed: " + e.getMessage(), e);
        } finally {
            // Always release the pooled connection, including the non-200 early return.
            closeQuietly(response);
        }
        return null;
    }

    /**
     * Returns the charset declared in a Content-Type header, or the default charset
     * when the header is missing or declares none.
     */
    private static String charsetFromContentType(Header contentType) {
        if (contentType == null || contentType.getValue() == null) {
            return DEFAULT_CHARSET;
        }
        Matcher m = CHARSET_PATTERN.matcher(contentType.getValue());
        return m.find() ? m.group(1) : DEFAULT_CHARSET;
    }

    /**
     * Reads the whole entity and decodes it. The charset reported by
     * {@code Icu4jDetector.getEncode} wins when it is non-null; otherwise
     * {@code fallbackCharset} is used.
     *
     * @return the decoded body, or {@code null} when {@code entity} is {@code null}
     */
    private static String readBody(HttpEntity entity, String fallbackCharset) throws IOException {
        if (entity == null) {
            return null;
        }

        byte[] data;
        InputStream in = entity.getContent();
        try {
            ByteArrayOutputStream buffer = new ByteArrayOutputStream();
            byte[] chunk = new byte[BUFFER_SIZE];
            int read;
            while ((read = in.read(chunk)) != -1) {
                buffer.write(chunk, 0, read);
            }
            data = buffer.toByteArray();
        } finally {
            in.close();
        }

        String detected = Icu4jDetector.getEncode(data);
        String charset = detected == null ? fallbackCharset : detected;
        try {
            return new String(data, charset);
        } catch (UnsupportedEncodingException e) {
            // A bogus charset name should not discard a body that was already downloaded.
            log.warn("unsupported charset '" + charset + "', falling back to " + DEFAULT_CHARSET);
            return new String(data, DEFAULT_CHARSET);
        }
    }

    /** Closes the response if present; a failure to close is logged and otherwise ignored. */
    private static void closeQuietly(CloseableHttpResponse response) {
        if (response == null) {
            return;
        }
        try {
            response.close();
        } catch (IOException ignored) {
            // Nothing useful can be done here; the body has already been read or abandoned.
            log.debug("failed to close response: " + ignored.getMessage());
        }
    }
}
相關文章
相關標籤/搜索