核心思想:
1:從響應頭(Content-Type)中讀取編碼【命中時解碼準確率最高】
2:若響應頭中沒有,則打開流從源碼中讀取【取捨:若有,通常出現在前30行,故只在前100行中尋找】
3:若仍然沒有,則根據字節流開頭的字節順序標記(BOM)識別【依據前三個字節推測】
4:最終依舊沒有命中時,採用大陸國標編碼 gb2312【走到這一步的機率接近於0】
綜合效果:尚未測試到編碼有問題的站點。
1 /** 2 * @declare:下載 自動識別編碼 3 * @param url 4 * @return 5 * @author cphmvp 6 */ 7 public static StringBuffer downloadHtmlAutoCode(String url) { 8 StringBuffer sb = new StringBuffer(); 9 BufferedReader bufferReader = null; 10 InputStream inputStream = null; 11 BufferedInputStream bufferedInputStream = null; 12 int tryNum = 0; 13 while (true) { 14 try { 15 if (tryNum > 1) { 16 String ecodingUrl = encodParamters(url); 17 urlModel = new URL(ecodingUrl); 18 } else { 19 urlModel = new URL(url); 20 } 21 httpURLConnection = (HttpURLConnection) urlModel 22 .openConnection(); 23 httpURLConnection.setConnectTimeout(connectTimeout); 24 httpURLConnection.setReadTimeout(readTimeout); 25 // httpURLConnection.setInstanceFollowRedirects(false); 26 // httpURLConnection.setFollowRedirects(true); 27 httpURLConnection 28 .setRequestProperty("User-Agent", 29 "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)"); 30 String redirectUrl = httpURLConnection.getURL().toString(); 31 if (!redirectUrl.equals(url)) { 32 LOG.info(url + "重定向後爲" + redirectUrl); 33 } 34 // 獲得響應流 35 inputStream = httpURLConnection.getInputStream(); 36 if (null == inputStream) 37 continue; 38 String charSetHeader = httpURLConnection 39 .getHeaderField("Content-Type"); 40 bufferedInputStream = new BufferedInputStream(inputStream); 41 String charSet = null; 42 // 第一步先從響應頭header判斷 43 if (charSetHeader != null) { 44 Pattern p = Pattern.compile("charset=[\"']?(.*)['\"]?"); 45 Matcher m = p.matcher(charSetHeader); 46 if (m.find()) { 47 charSet = m.group(1).trim(); 48 } 49 } 50 // System.out.println(bufferedInputStream.available() > 0); 51 // System.out.println(bufferedInputStream.markSupported()); 52 // 第二步 從源碼中【meta http-equiv="content-type" 】判斷 53 // if (null == charSet) { 54 // charSet = getEncode(bufferedInputStream); 55 // System.out.println("---->charSet: 讀流識別出來的編碼" + charSet); 56 // } 57 58 // 排除非html格式 只有一兩行的情況 59 if (null == charSet 60 && charSetHeader.toLowerCase().contains("html")) { 61 // 緩衝區設置大些, 
read走的信息小於 這個值,就能reset 回來。 62 bufferedInputStream.mark(102400); 63 bufferReader = new BufferedReader(new InputStreamReader( 64 bufferedInputStream)); 65 int lineNum = 1; 66 String inputLine; 67 // reset 在讀至流的末尾是沒法生效,故限制前100行找,找不到 放棄 68 while ((inputLine = bufferReader.readLine()) != null 69 && lineNum < 100) { 70 if (inputLine.toLowerCase().contains("charset")) { 71 charSet = RegexUtils.getString(inputLine, 72 "charset=[\"']?(.*?)[\"']", 1); 73 LOG.info("自動識別出編碼:" + charSet); 74 // 第一次匹配到後 ,再也不往下判斷,減小判斷行數,及誤判機率 75 break; 76 } 77 lineNum++; 78 inputLine = null; 79 } 80 // 第三步奏 穿插補錄步奏 81 if (null == charSet) { 82 byte[] head = new byte[3]; 83 bufferedInputStream.read(head); 84 if (head[0] == -1 && head[1] == -2) 85 charSet = "UTF-16"; 86 if (head[0] == -2 && head[1] == -1) 87 charSet = "Unicode"; 88 if (head[0] == -17 && head[1] == -69 && head[2] == -65) 89 charSet = "UTF-8"; 90 } 91 92 // 通道回溯 93 bufferedInputStream.reset(); 94 } 95 96 // 第四步奏指向默認 utf-8 97 charSet = (charSet == null ? defaultEncoding : charSet); 98 // 第五步奏按照正確編碼解碼響應流 99 bufferReader = new BufferedReader(new InputStreamReader( 100 bufferedInputStream, charSet)); 101 String inputLine; 102 while ((inputLine = bufferReader.readLine()) != null) { 103 sb.append(inputLine + "\n"); 104 inputLine = null; 105 } 106 if (bufferReader != null) 107 try { 108 bufferReader.close(); 109 } catch (IOException e) { 110 LOG.error(e); 111 } 112 if (httpURLConnection != null) 113 httpURLConnection.disconnect(); 114 break; 115 } catch (Exception e) { 116 if (tryNum++ == 3) { 117 LOG.error("download page error [ " + urlModel + " ] "); 118 return null; 119 } 120 LOG.warn(tryNum + "次下載失敗"); 121 } 122 } 123 return sb; 124 125 }