簡介 : HttpClient
是Apache Jakarta Common下的子項目,用於提供高效的、功能豐富的、支持HTTP協議的客戶端編程工具包,其主要功能如下:
關於Http請求的方法說明,參考大佬整理的博客:
http://www.javashuo.com/article/p-bbsrusen-dz.html
1 JDK1.8
2 IntelliJ IDEA
3 IDEA自帶的Maven
創建Maven工程itcast-crawler-first,並在pom.xml中加入依賴:
<dependencies>
    <!-- HttpClient -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.3</version>
    </dependency>

    <!-- Logging: SLF4J binding for log4j 1.2 -->
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-log4j12</artifactId>
        <version>1.7.25</version>
    </dependency>
</dependencies>
關於日誌的配置文件:
# Root logger: DEBUG level, output sent to the appender named A1.
log4j.rootLogger=DEBUG,A1
# Loggers under the cn.itcast package are also set to DEBUG.
log4j.logger.cn.itcast = DEBUG
# A1 writes log events to the console.
log4j.appender.A1=org.apache.log4j.ConsoleAppender
# A1 formats each event with a pattern layout.
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
# Pattern: timestamp [thread] [logger]-[level] message, followed by a line separator.
log4j.appender.A1.layout.ConversionPattern=%-d{yyyy-MM-dd HH:mm:ss,SSS} [%t] [%c]-[%p] %m%n
log4j可以將日誌以文件的形式輸出,也可以打印在控制檯上,同時可以設置輸出的日誌內容顯示格式、日誌文件的生成方式(追加、覆蓋、設置日誌文件大小等等)。我這裏就是直接將日誌打印到控制檯上,即使用 org.apache.log4j.ConsoleAppender。
編寫最簡單的爬蟲,抓取傳智播客首頁:http://www.itcast.cn/
public class CrawlerFirst { public static void main(String[] args) throws Exception { //1. 打開瀏覽器,建立HttpClient對象 CloseableHttpClient httpClient = HttpClients.createDefault(); //2. 輸入網址,發起get請求建立HttpGet對象 HttpGet httpGet = new HttpGet("http://www.itcast.cn"); //3.按回車,發起請求,返回響應,使用HttpClient對象發起請求 CloseableHttpResponse response = httpClient.execute(httpGet); //4. 解析響應,獲取數據 //判斷狀態碼是不是200 if (response.getStatusLine().getStatusCode() == 200) { HttpEntity httpEntity = response.getEntity(); String content = EntityUtils.toString(httpEntity, "utf8"); System.out.println(content); } } }
package cn.itcast.crawler.test;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * GET request example without parameters: fetches the itcast home page and
 * prints the length of the returned content.
 */
public class HttpGetTest {

    /**
     * Executes the GET request. I/O failures are reported via a stack trace
     * rather than propagated.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Create the HttpGet object with the target URL.
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");

        // try-with-resources closes the response and the client. Unlike an
        // unconditional response.close() in a finally block, it does not hit
        // a NullPointerException when execute() itself fails.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // Parse the response; only a 200 status is treated as success.
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
public class HttpGetParamTest { public static void main(String[] args) throws Exception { //建立HttpClient對象 CloseableHttpClient httpClient = HttpClients.createDefault(); //設置請求地址是:http://yun.itheima.com/search?keys=Java //建立URIBuilder URIBuilder uriBuilder = new URIBuilder("http://yun.itheima.com/search"); //設置參數 uriBuilder.setParameter("keys","Java"); //建立HttpGet對象,設置url訪問地址 HttpGet httpGet = new HttpGet(uriBuilder.build()); System.out.println("發起請求的信息:"+httpGet); CloseableHttpResponse response = null; try { //使用HttpClient發起請求,獲取response response = httpClient.execute(httpGet); //解析響應 if (response.getStatusLine().getStatusCode() == 200) { String content = EntityUtils.toString(response.getEntity(), "utf8"); System.out.println(content.length()); } } catch (IOException e) { e.printStackTrace(); }finally { //關閉response try { response.close(); } catch (IOException e) { e.printStackTrace(); } try { httpClient.close(); } catch (IOException e) { e.printStackTrace(); } } } }
public class HttpPostTest { public static void main(String[] args) { //建立HttpClient對象 CloseableHttpClient httpClient = HttpClients.createDefault(); //建立HttpPost對象,設置url訪問地址 HttpPost httpPost = new HttpPost("http://www.itcast.cn"); CloseableHttpResponse response = null; try { //使用HttpClient發起請求,獲取response response = httpClient.execute(httpPost); //解析響應 if (response.getStatusLine().getStatusCode() == 200) { String content = EntityUtils.toString(response.getEntity(), "utf8"); System.out.println(content.length()); } } catch (IOException e) { e.printStackTrace(); }finally { //關閉response try { response.close(); } catch (IOException e) { e.printStackTrace(); } try { httpClient.close(); } catch (IOException e) { e.printStackTrace(); } } } }
public class HttpPostParamTest { public static void main(String[] args) throws Exception { //建立HttpClient對象 CloseableHttpClient httpClient = HttpClients.createDefault(); //建立HttpPost對象,設置url訪問地址 HttpPost httpPost = new HttpPost("http://yun.itheima.com/search"); //聲明List集合,封裝表單中的參數 List<NameValuePair> params = new ArrayList<NameValuePair>(); //設置請求地址是:http://yun.itheima.com/search?keys=Java params.add(new BasicNameValuePair("keys","Java")); //建立表單的Entity對象,第一個參數就是封裝好的表單數據,第二個參數就是編碼 UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params,"utf8"); //設置表單的Entity對象到Post請求中 httpPost.setEntity(formEntity); CloseableHttpResponse response = null; try { //使用HttpClient發起請求,獲取response response = httpClient.execute(httpPost); //解析響應 if (response.getStatusLine().getStatusCode() == 200) { String content = EntityUtils.toString(response.getEntity(), "utf8"); System.out.println(content.length()); } } catch (IOException e) { e.printStackTrace(); }finally { //關閉response try { response.close(); } catch (IOException e) { e.printStackTrace(); } try { httpClient.close(); } catch (IOException e) { e.printStackTrace(); } } } }
如果每次請求都要創建HttpClient,會有頻繁創建和銷燬的問題,可以使用連接池來解決這個問題。
測試以下代碼,並通過斷點查看:每次獲取的HttpClient對象都是不同的,但它們共用同一個連接池管理器。
public class HttpClientPoolTest { public static void main(String[] args) { //建立鏈接池管理器 PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager(); //設置最大鏈接數 cm.setMaxTotal(100); //設置每一個主機的最大鏈接數 cm.setDefaultMaxPerRoute(10); //使用鏈接池管理器發起請求 doGet(cm); doGet(cm); } private static void doGet(PoolingHttpClientConnectionManager cm) { //不是每次建立新的HttpClient,而是從鏈接池中獲取HttpClient對象 CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build(); HttpGet httpGet = new HttpGet("http://www.itcast.cn"); CloseableHttpResponse response = null; try { response = httpClient.execute(httpGet); if (response.getStatusLine().getStatusCode() == 200) { String content = EntityUtils.toString(response.getEntity(), "utf8"); System.out.println(content.length()); } } catch (IOException e) { e.printStackTrace(); }finally { if (response != null) { try { response.close(); } catch (IOException e) { e.printStackTrace(); } //不能關閉HttpClient,由鏈接池管理HttpClient //httpClient.close(); } } } }
在構建網絡爬蟲時,經常需要配置很多信息,例如ConnectionRequestTimeout(從連接池獲取連接的超時時間)、ConnectTimeout(建立連接的超時時間)、SocketTimeout(讀取數據的超時時間)、代理、是否允許重定向等信息。
在HttpClient中,實現這些配置需要用到RequestConfig類的一個內部類Builder。
如下爲Builder的源碼,代碼太長了,我直接摺疊了。
/**
 * Builder nested in {@code RequestConfig}. Collects request-level settings
 * and produces a {@code RequestConfig} via {@link #build()}. Every setter
 * returns this builder so calls can be chained.
 */
public static class Builder {

    // Settings without an explicit initializer keep the Java default (false / null / 0).
    private boolean expectContinueEnabled;
    private HttpHost proxy;
    private InetAddress localAddress;
    private boolean staleConnectionCheckEnabled = false;
    private String cookieSpec;
    private boolean redirectsEnabled = true;
    private boolean relativeRedirectsAllowed = true;
    private boolean circularRedirectsAllowed;
    private int maxRedirects = 50;
    private boolean authenticationEnabled = true;
    private Collection<String> targetPreferredAuthSchemes;
    private Collection<String> proxyPreferredAuthSchemes;
    // All three timeouts start at -1.
    private int connectionRequestTimeout = -1;
    private int connectTimeout = -1;
    private int socketTimeout = -1;
    private boolean contentCompressionEnabled = true;

    /** Package-private constructor; the defaults come from the field initializers above. */
    Builder() {
    }

    /** Sets the expect-continue flag. */
    public Builder setExpectContinueEnabled(final boolean expectContinueEnabled) {
        this.expectContinueEnabled = expectContinueEnabled;
        return this;
    }

    /** Sets the proxy host. */
    public Builder setProxy(final HttpHost proxy) {
        this.proxy = proxy;
        return this;
    }

    /** Sets the local address. */
    public Builder setLocalAddress(final InetAddress localAddress) {
        this.localAddress = localAddress;
        return this;
    }

    /**
     * @deprecated (4.4) Use {@link
     *   org.apache.http.impl.conn.PoolingHttpClientConnectionManager#setValidateAfterInactivity(int)}
     */
    @Deprecated
    public Builder setStaleConnectionCheckEnabled(final boolean staleConnectionCheckEnabled) {
        this.staleConnectionCheckEnabled = staleConnectionCheckEnabled;
        return this;
    }

    /** Sets the cookie specification name. */
    public Builder setCookieSpec(final String cookieSpec) {
        this.cookieSpec = cookieSpec;
        return this;
    }

    /** Sets whether redirects are enabled (initially {@code true}). */
    public Builder setRedirectsEnabled(final boolean redirectsEnabled) {
        this.redirectsEnabled = redirectsEnabled;
        return this;
    }

    /** Sets whether relative redirects are allowed (initially {@code true}). */
    public Builder setRelativeRedirectsAllowed(final boolean relativeRedirectsAllowed) {
        this.relativeRedirectsAllowed = relativeRedirectsAllowed;
        return this;
    }

    /** Sets whether circular redirects are allowed. */
    public Builder setCircularRedirectsAllowed(final boolean circularRedirectsAllowed) {
        this.circularRedirectsAllowed = circularRedirectsAllowed;
        return this;
    }

    /** Sets the maximum number of redirects (initially 50). */
    public Builder setMaxRedirects(final int maxRedirects) {
        this.maxRedirects = maxRedirects;
        return this;
    }

    /** Sets whether authentication is enabled (initially {@code true}). */
    public Builder setAuthenticationEnabled(final boolean authenticationEnabled) {
        this.authenticationEnabled = authenticationEnabled;
        return this;
    }

    /** Sets the preferred authentication schemes for the target host. */
    public Builder setTargetPreferredAuthSchemes(final Collection<String> targetPreferredAuthSchemes) {
        this.targetPreferredAuthSchemes = targetPreferredAuthSchemes;
        return this;
    }

    /** Sets the preferred authentication schemes for the proxy host. */
    public Builder setProxyPreferredAuthSchemes(final Collection<String> proxyPreferredAuthSchemes) {
        this.proxyPreferredAuthSchemes = proxyPreferredAuthSchemes;
        return this;
    }

    /** Sets the connection-request timeout (initially -1). */
    public Builder setConnectionRequestTimeout(final int connectionRequestTimeout) {
        this.connectionRequestTimeout = connectionRequestTimeout;
        return this;
    }

    /** Sets the connect timeout (initially -1). */
    public Builder setConnectTimeout(final int connectTimeout) {
        this.connectTimeout = connectTimeout;
        return this;
    }

    /** Sets the socket timeout (initially -1). */
    public Builder setSocketTimeout(final int socketTimeout) {
        this.socketTimeout = socketTimeout;
        return this;
    }

    /**
     * @deprecated (4.5) Set {@link #setContentCompressionEnabled(boolean)} to {@code false} and
     * add the {@code Accept-Encoding} request header.
     */
    @Deprecated
    public Builder setDecompressionEnabled(final boolean decompressionEnabled) {
        // Writes the same field as setContentCompressionEnabled.
        this.contentCompressionEnabled = decompressionEnabled;
        return this;
    }

    /** Sets whether content compression is enabled (initially {@code true}). */
    public Builder setContentCompressionEnabled(final boolean contentCompressionEnabled) {
        this.contentCompressionEnabled = contentCompressionEnabled;
        return this;
    }

    /**
     * Creates a {@code RequestConfig} from the values currently held by this builder.
     *
     * @return the new configuration object
     */
    public RequestConfig build() {
        return new RequestConfig(
                expectContinueEnabled,
                proxy,
                localAddress,
                staleConnectionCheckEnabled,
                cookieSpec,
                redirectsEnabled,
                relativeRedirectsAllowed,
                circularRedirectsAllowed,
                maxRedirects,
                authenticationEnabled,
                targetPreferredAuthSchemes,
                proxyPreferredAuthSchemes,
                connectionRequestTimeout,
                connectTimeout,
                socketTimeout,
                contentCompressionEnabled);
    }
}
HttpClient中可設置三個超時:ConnectionRequestTimeout(從連接池獲取連接的超時時間)、ConnectTimeout(建立連接的超時時間)、SocketTimeout(讀取數據的超時時間)。使用RequestConfig進行配置的示例程序如下:
//所有設置爲10秒 RequestConfig requestConfig = RequestConfig.custom() .setSocketTimeout(10000) .setConnectTimeout(10000) .setConnectionRequestTimeout(10000) .build(); //配置httpClient HttpClient httpClient = HttpClients.custom() .setDefaultRequestConfig(requestConfig) .build();
// Add the proxy to the request configuration.
RequestConfig defaultRequestConfig = RequestConfig.custom()
        .setProxy(new HttpHost("171.97.67.160", 3128, null))
        .build();

// Build an HttpClient that applies this configuration by default.
HttpClient httpClient = HttpClients.custom()
        .setDefaultRequestConfig(defaultRequestConfig)
        .build();