註解:此處僅僅是介紹一些類庫及常規使用,若是要詳細瞭解Http協議推薦看下《Http權威指南》
.NET 方向:主要是用到 HttpWebRequest 下載 HTML 內容
JAVA方向:
主要是用到HttpClient下載內容
示例代碼:
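相關類庫(httpclient-4.1.2 httpcore-4.1.4;注意:下方示例使用的 CloseableHttpClient、PoolingHttpClientConnectionManager、HttpClientContext、RequestConfig 等 API 自 HttpClient 4.3 起才提供,需使用 4.3 或以上版本才能編譯)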
示例代碼(Java):
package com.data.crawl.qa.baiduzhidao;import java.io.ByteArrayOutputStream;import java.io.IOException;import java.io.InputStream;import java.io.UnsupportedEncodingException;import java.util.ArrayList;import java.util.List;import java.util.Map;import java.util.Set;import java.util.regex.Matcher;import java.util.regex.Pattern;import org.apache.commons.logging.Log;import org.apache.commons.logging.LogFactory;import org.apache.http.Header;import org.apache.http.HttpEntity;import org.apache.http.HttpStatus;import org.apache.http.NameValuePair;import org.apache.http.client.ClientProtocolException;import org.apache.http.client.config.CookieSpecs;import org.apache.http.client.config.RequestConfig;import org.apache.http.client.entity.UrlEncodedFormEntity;import org.apache.http.client.methods.CloseableHttpResponse;import org.apache.http.client.methods.HttpGet;import org.apache.http.client.methods.HttpPost;import org.apache.http.client.protocol.HttpClientContext;import org.apache.http.impl.client.CloseableHttpClient;import org.apache.http.impl.client.HttpClients;import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;import org.apache.http.message.BasicNameValuePair;import org.apache.http.config.Registry;import org.apache.http.config.RegistryBuilder;import org.apache.http.cookie.CookieSpecProvider;import org.apache.http.impl.cookie.BestMatchSpecFactory;import org.apache.http.impl.cookie.BrowserCompatSpecFactory;/** * HttpClient鏈接池 * * @author wqj * */public class HttpClientPool { private static Log log = LogFactory.getLog(HttpClientPool.class); /** * 最大HttpClient鏈接數 */ private final int MAX_TOTAL_CONNECTIONS = 10; /** * HttpClient鏈接池 */ private PoolingHttpClientConnectionManager connectionManager; /** * cookie 上下文 */ protected HttpClientContext context = null; /** * default constructor */ public HttpClientPool(){ connectionManager = new PoolingHttpClientConnectionManager(); /* 鏈接池最大生成鏈接數200 */ connectionManager.setMaxTotal(MAX_TOTAL_CONNECTIONS); /* 
默認設置route最大鏈接數爲20 */ connectionManager.setDefaultMaxPerRoute(10); // 實例化cookie context = HttpClientContext.create(); Registry<CookieSpecProvider> registry = RegistryBuilder.<CookieSpecProvider> create() .register(CookieSpecs.BEST_MATCH, new BestMatchSpecFactory()) .register(CookieSpecs.BROWSER_COMPATIBILITY, new BrowserCompatSpecFactory()).build(); context.setCookieSpecRegistry(registry); } /** * 從線程池實例化HttpClient * * @return */ private CloseableHttpClient getHttpClient() { int socketTimeOut = 120000; int connectionTimeOut = 60000; RequestConfig config = RequestConfig.custom().setSocketTimeout(socketTimeOut) .setConnectTimeout(connectionTimeOut).setCookieSpec(CookieSpecs.BEST_MATCH).build(); return HttpClients.custom().setDefaultRequestConfig(config).setConnectionManager(connectionManager).build(); } /** * Post方式 */ public String Post(String uri, Map<String, String> params) { CloseableHttpClient httpclient = getHttpClient(); HttpPost httpost = new HttpPost(uri); List<NameValuePair> post_data = new ArrayList<NameValuePair>(); Set<String> keySet = params.keySet(); for (String key : keySet) { post_data.add(new BasicNameValuePair(key, params.get(key))); } CloseableHttpResponse response = null; try { httpost.setEntity(new UrlEncodedFormEntity(post_data, "UTF-8")); response = httpclient.execute(httpost, context); //默認編碼 String charset = "utf-8"; HttpEntity entity = response.getEntity(); String html = null; if (entity != null) { InputStream in = entity.getContent(); /* 偵測編碼 */ ByteArrayOutputStream swapStream = new ByteArrayOutputStream(); byte[] buff = new byte[1024]; int rc = 0; while ((rc = in.read(buff, 0, 1024)) > 0) { swapStream.write(buff, 0, rc); } byte[] data = swapStream.toByteArray(); String charset_1 = Icu4jDetector.getEncode(data); charset = charset_1 == null ? 
charset : charset_1; html = new String(data, charset); System.out.println(html); in.close(); } return html; } catch (UnsupportedEncodingException e) { log.error(e.getMessage()); } catch (ClientProtocolException e) { log.error(e.getMessage()); } catch (IOException e) { log.error(e.getMessage()); } return null; } /** * 模擬登錄時,訪問首頁時使用此方法,此方法不帶cookie * * @param uri 統一資源定位符 * @return html文檔 */ public String downHtml(String uri) { CloseableHttpClient httpclient = getHttpClient(); HttpGet httpget = new HttpGet(uri); CloseableHttpResponse response = null; try { response = httpclient.execute(httpget); /* 判斷訪問的狀態碼 */ int statusCode = response.getStatusLine().getStatusCode(); if (statusCode != HttpStatus.SC_OK) { log.info("request failed: " + response.getStatusLine()); return null; } /* 偵測編碼 */ Pattern pattern = Pattern.compile("text/html;[\\s]*charset=(.*)"); Header[] arr = response.getHeaders("Content-Type"); String charset = "utf-8"; if (arr != null) { String content = arr[0].getValue().toLowerCase(); Matcher m = pattern.matcher(content); if (m.find()) { charset = m.group(1); } } HttpEntity entity = response.getEntity(); String html = null; if (entity != null) { InputStream in = entity.getContent(); /* 偵測編碼 */ ByteArrayOutputStream swapStream = new ByteArrayOutputStream(); byte[] buff = new byte[1024]; int rc = 0; while ((rc = in.read(buff, 0, 1024)) > 0) { swapStream.write(buff, 0, rc); } byte[] data = swapStream.toByteArray(); String charset_1 = Icu4jDetector.getEncode(data); charset = charset_1 == null ? charset : charset_1; html = new String(data, charset); in.close(); } return html; } catch (ClientProtocolException e) { log.info(e.getMessage()); } catch (IOException e) { log.info(e.getMessage()); } return null; }}