利用Java的HttpURLConnection捕獲網頁信息,同時記錄連接的sessionId。
考慮到各網頁的編碼方式不一樣,程序會自動解析響應的Content-Type,捕獲其中的charset,並按該編碼方式讀取網頁內容。但也存在一些不正規的網頁,捕獲不到charset;對於這類網頁,代碼默認按UTF-8讀取,必要時可改用iso-8859-1逐字節讀取後再轉碼。
此處記錄sessionId,是爲了下次訪問時可以使用同一個sessionId,讓服務器認爲仍處於同一個會話中,例如登錄驗證之類的場景。
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Fetches the content of a web page over HTTP and remembers the session
 * cookie returned by the server so that later requests can reuse it
 * (for example for login-related flows).
 *
 * <p>The session cookie is kept in a static field without any
 * synchronization; nothing in this class makes concurrent use safe.
 *
 * @author Arthur126
 * @date 2015-8-21 10:00:14 PM
 */
public class CallHttpTest {

    /** Session cookie ("name=value") recorded from a previous response; empty until one is captured. */
    private static String SESSION_ID = "";

    /** Charset name returned when the Content-Type header carries no usable charset parameter. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /**
     * Matches the charset parameter of a Content-Type value: case-insensitive
     * name, optional whitespace around '=', optional quotes around the value,
     * and the value ends at whitespace, ';' or a quote.
     */
    private static final Pattern CHARSET_PATTERN =
            Pattern.compile("charset\\s*=\\s*[\"']?([^\\s;\"']+)", Pattern.CASE_INSENSITIVE);

    /**
     * Sends a POST request to the given URL and returns the response body as text.
     *
     * <p>If a session cookie has already been recorded it is sent in the
     * {@code Cookie} request header; otherwise the response headers are scanned
     * for {@code Set-Cookie} and the cookie found there is recorded for later calls.
     * The body is decoded with the charset reported by {@link #getChareset(String)}.
     * Lines are concatenated without their line terminators.
     *
     * @param callURL the URL to request
     * @return the response body with line terminators removed
     * @throws Exception if the URL is malformed, the connection fails, or the
     *                   reported charset is not supported
     */
    public static String callHttp(String callURL) throws Exception {
        HttpURLConnection conn = (HttpURLConnection) new URL(callURL).openConnection();
        try {
            conn.setRequestMethod("POST");
            conn.setRequestProperty("Content-Type", "text/plain");
            conn.setRequestProperty("Content-Language", "en-US");
            conn.setConnectTimeout(30000);
            conn.setReadTimeout(30000);
            conn.setUseCaches(false);
            conn.setDoInput(true);
            conn.setDoOutput(true);

            if (SESSION_ID != null && !SESSION_ID.isEmpty()) {
                // A session cookie was recorded earlier: send it back to the server.
                conn.setRequestProperty("Cookie", SESSION_ID);
            } else {
                // No session yet: record the cookie from this response.
                captureSessionId(conn);
            }

            // Decode the body with the charset announced in the Content-Type header.
            // For pages that announce a wrong charset, an alternative is to read the
            // raw bytes as ISO-8859-1 and re-decode them afterwards.
            String charset = getChareset(conn.getContentType());
            StringBuilder buffer = new StringBuilder();
            // try-with-resources closes the reader even when reading fails.
            try (BufferedReader reader =
                    new BufferedReader(new InputStreamReader(conn.getInputStream(), charset))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    buffer.append(line);
                }
            }
            return buffer.toString();
        } finally {
            // Always release the connection, including on failure.
            conn.disconnect();
        }
    }

    /**
     * Scans the response headers for {@code Set-Cookie} and stores the
     * "name=value" part (everything before the first ';') in {@link #SESSION_ID}.
     * A cookie without any ';' attribute separator is stored as-is.
     */
    private static void captureSessionId(HttpURLConnection conn) {
        String key;
        // Index 0 is skipped, as in the original loop, which started at 1.
        for (int i = 1; (key = conn.getHeaderFieldKey(i)) != null; i++) {
            if (key.equalsIgnoreCase("set-cookie")) {
                String cookie = conn.getHeaderField(key);
                if (cookie != null) {
                    int end = cookie.indexOf(';');
                    // Guard against cookies with no attributes (no ';' present).
                    SESSION_ID = end == -1 ? cookie : cookie.substring(0, end);
                }
                break;
            }
        }
    }

    /**
     * Extracts the charset name from a Content-Type header value.
     *
     * <p>The parameter name is matched case-insensitively; surrounding quotes,
     * whitespace and any parameters following the charset are stripped.
     *
     * @param contentType the Content-Type header value, may be {@code null}
     * @return the charset name as written in the header, or {@code "UTF-8"} when
     *         the header is {@code null} or has no non-empty charset parameter
     */
    public static String getChareset(String contentType) {
        if (contentType == null) {
            return DEFAULT_CHARSET;
        }
        Matcher matcher = CHARSET_PATTERN.matcher(contentType);
        return matcher.find() ? matcher.group(1) : DEFAULT_CHARSET;
    }

    /** Fetches a sample page, then prints its content and the recorded session cookie. */
    public static void main(String[] args) throws Exception {
        System.out.println(callHttp("https://www.baidu.com/"));
        System.out.println(SESSION_ID);
    }
}
測試結果:
<html> <head> <meta http-equiv="content-type" content="text/html;charset=utf-8"> <meta http-equiv="X-UA-Compatible" content="IE=Edge"> <title>頁面不存在_百度搜索</title> </head> <body link="#0000cc"> .......略 </body> </html> __bsi=14410402226605058380_00_12_R_N_2_0301_002F_N_I_I_0