crawler_httpurlconnection_自動編碼識別

時間 2019-11-18

標籤 crawler httpurlconnection 自動編碼識別欄目 HTTP/TCP 简体版

原文原文鏈接

核心思想：

　　 1：從響應頭中讀取【命中時解碼準確率最高】

　　 2：如果響應頭中沒有，打開流從源碼中讀取【取捨：如果有，通常會出現在前30行，因此只在前100行中尋找】

　　 3：如果還是沒有，根據流開頭的字節標記（BOM）識別編碼【依據前三個字節推測】

　　 4：最終依舊沒有命中時，採用大陸國標編碼【機率接近於0，gb2312】

綜合效果：尚未測試到編碼有問題的站點。

  1 /**
  2      * @declare:下載 自動識別編碼
  3      * @param url
  4      * @return
  5      * @author cphmvp
  6      */
  7     public static StringBuffer downloadHtmlAutoCode(String url) {
  8         StringBuffer sb = new StringBuffer();
  9         BufferedReader bufferReader = null;
 10         InputStream inputStream = null;
 11         BufferedInputStream bufferedInputStream = null;
 12         int tryNum = 0;
 13         while (true) {
 14             try {
 15                 if (tryNum > 1) {
 16                     String ecodingUrl = encodParamters(url);
 17                     urlModel = new URL(ecodingUrl);
 18                 } else {
 19                     urlModel = new URL(url);
 20                 }
 21                 httpURLConnection = (HttpURLConnection) urlModel
 22                         .openConnection();
 23                 httpURLConnection.setConnectTimeout(connectTimeout);
 24                 httpURLConnection.setReadTimeout(readTimeout);
 25                 // httpURLConnection.setInstanceFollowRedirects(false);
 26                 // httpURLConnection.setFollowRedirects(true);
 27                 httpURLConnection
 28                         .setRequestProperty("User-Agent",
 29                                 "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)");
 30                 String redirectUrl = httpURLConnection.getURL().toString();
 31                 if (!redirectUrl.equals(url)) {
 32                     LOG.info(url + "重定向後爲" + redirectUrl);
 33                 }
 34                 // 獲得響應流
 35                 inputStream = httpURLConnection.getInputStream();
 36                 if (null == inputStream)
 37                     continue;
 38                 String charSetHeader = httpURLConnection
 39                         .getHeaderField("Content-Type");
 40                 bufferedInputStream = new BufferedInputStream(inputStream);
 41                 String charSet = null;
 42                 // 第一步先從響應頭header判斷
 43                 if (charSetHeader != null) {
 44                     Pattern p = Pattern.compile("charset=[\"']?(.*)['\"]?");
 45                     Matcher m = p.matcher(charSetHeader);
 46                     if (m.find()) {
 47                         charSet = m.group(1).trim();
 48                     }
 49                 }
 50                 // System.out.println(bufferedInputStream.available() > 0);
 51                 // System.out.println(bufferedInputStream.markSupported());
 52                 // 第二步 從源碼中【meta http-equiv="content-type" 】判斷
 53                 // if (null == charSet) {
 54                 // charSet = getEncode(bufferedInputStream);
 55                 // System.out.println("---->charSet: 讀流識別出來的編碼" + charSet);
 56                 // }
 57 
 58                 // 排除非html格式 只有一兩行的情況
 59                 if (null == charSet
 60                         && charSetHeader.toLowerCase().contains("html")) {
 61                     // 緩衝區設置大些， read走的信息小於 這個值，就能reset 回來。
 62                     bufferedInputStream.mark(102400);
 63                     bufferReader = new BufferedReader(new InputStreamReader(
 64                             bufferedInputStream));
 65                     int lineNum = 1;
 66                     String inputLine;
 67                     // reset 在讀至流的末尾是沒法生效，故限制前100行找，找不到 放棄
 68                     while ((inputLine = bufferReader.readLine()) != null
 69                             && lineNum < 100) {
 70                         if (inputLine.toLowerCase().contains("charset")) {
 71                             charSet = RegexUtils.getString(inputLine,
 72                                     "charset=[\"']?(.*?)[\"']", 1);
 73                             LOG.info("自動識別出編碼：" + charSet);
 74                             // 第一次匹配到後 ，再也不往下判斷，減小判斷行數，及誤判機率
 75                             break;
 76                         }
 77                         lineNum++;
 78                         inputLine = null;
 79                     }
 80                     // 第三步奏 穿插補錄步奏
 81                     if (null == charSet) {
 82                         byte[] head = new byte[3];
 83                         bufferedInputStream.read(head);
 84                         if (head[0] == -1 && head[1] == -2)
 85                             charSet = "UTF-16";
 86                         if (head[0] == -2 && head[1] == -1)
 87                             charSet = "Unicode";
 88                         if (head[0] == -17 && head[1] == -69 && head[2] == -65)
 89                             charSet = "UTF-8";
 90                     }
 91 
 92                     // 通道回溯
 93                     bufferedInputStream.reset();
 94                 }
 95 
 96                 // 第四步奏指向默認 utf-8
 97                 charSet = (charSet == null ? defaultEncoding : charSet);
 98                 // 第五步奏按照正確編碼解碼響應流
 99                 bufferReader = new BufferedReader(new InputStreamReader(
100                         bufferedInputStream, charSet));
101                 String inputLine;
102                 while ((inputLine = bufferReader.readLine()) != null) {
103                     sb.append(inputLine + "\n");
104                     inputLine = null;
105                 }
106                 if (bufferReader != null)
107                     try {
108                         bufferReader.close();
109                     } catch (IOException e) {
110                         LOG.error(e);
111                     }
112                 if (httpURLConnection != null)
113                     httpURLConnection.disconnect();
114                 break;
115             } catch (Exception e) {
116                 if (tryNum++ == 3) {
117                     LOG.error("download page error [ " + urlModel + " ] ");
118                     return null;
119                 }
120                 LOG.warn(tryNum + "次下載失敗");
121             }
122         }
123         return sb;
124 
125     }