在做一些需要抓取網頁的項目時,經常會遇到亂碼問題。最省事的做法是去需要抓取的網站看看具體是什麼編碼,然後採用正確的編碼進行解碼就OK了,不過總是一個個頁面親自去判斷也不是個事兒,尤其是你需要大量抓取不同站點的頁面時,比如網頁爬蟲類的程序,這時我們需要做一個相對比較通用的程序,進行頁面編碼的正確識別。
亂碼問題基本上都是編碼不一致導致的,比如網頁編碼使用的是UTF-8,你使用GB2312去讀取,肯定會亂碼。知道了本質問題後剩下的就是如何判斷網頁編碼了。GBK、GB2312、UTF-8、BIG-5,一般來說遇到的中文網頁編碼大多是這幾種,簡化下就是只有GBK和UTF-8兩種,不誇張的說,現在的網站要麼是GBK編碼,要麼是UTF-8編碼,所以接下來的問題就是判斷站點具體是UTF-8的還是GBK的。
那怎麼判斷頁面具體編碼呢?首先查看響應頭的 Content-Type,若響應頭裏找不到,再去網頁裏查找meta頭,若還是找不到,那沒辦法了,設置個默認編碼吧,個人推薦設置成UTF-8。比如訪問博客園首頁http://www.cnblogs.com/,可以在響應頭裏看到 Content-Type: text/html; charset=utf-8,這樣我們就知道博客園是採用utf-8編碼,但並不是所有的網站都會在響應頭Content-Type加上頁面編碼,比如百度的就是Content-Type: text/html,找不到charset,這時只能去網頁裏面找<meta http-equiv=Content-Type content="text/html;charset=utf-8">,確認網頁最終編碼,總結下就是下面幾步:(1)先從響應頭 Content-Type 中查找 charset;(2)若找不到,再從網頁的 meta 頭中查找 charset;(3)若仍然找不到,則使用默認編碼(推薦 UTF-8)。
通過上面方法基本上能正確解析絕大多數頁面,實在不能識別的只好親自去核實下具體編碼了。
注意:web
下面分別給出Java和C#版的實現代碼,頁面底部給出了源碼的git鏈接,有需要的童鞋請自行下載。
package com.cnblogs.lzrabbit.util; import java.io.*; import java.net.*; import java.util.*; import java.util.Map.Entry; import java.util.regex.*; import java.util.zip.*; public class HttpUtil { public static String sendGet(String url) throws Exception { return send(url, "GET", null, null); } public static String sendPost(String url, String param) throws Exception { return send(url, "POST", param, null); } public static String send(String url, String method, String param, Map<String, String> headers) throws Exception { String result = null; HttpURLConnection conn = getConnection(url, method, param, headers); String charset = conn.getHeaderField("Content-Type"); charset = detectCharset(charset); InputStream input = getInputStream(conn); ByteArrayOutputStream output = new ByteArrayOutputStream(); int count; byte[] buffer = new byte[4096]; while ((count = input.read(buffer, 0, buffer.length)) > 0) { output.write(buffer, 0, count); } input.close(); // 若已經過請求頭獲得charset,則不須要去html裏面繼續查找 if (charset == null || charset.equals("")) { charset = detectCharset(output.toString()); // 若在html裏面仍是未找到charset,則設置默認編碼爲utf-8 if (charset == null || charset.equals("")) { charset = "utf-8"; } } result = output.toString(charset); output.close(); // result = output.toString(charset); // BufferedReader bufferReader = new BufferedReader(new // InputStreamReader(input, charset)); // String line; // while ((line = bufferReader.readLine()) != null) { // if (result == null) // bufferReader.mark(1); // result += line; // } // bufferReader.close(); return result; } private static String detectCharset(String input) { Pattern pattern = Pattern.compile("charset=\"?([\\w\\d-]+)\"?;?", Pattern.CASE_INSENSITIVE); if (input != null && !input.equals("")) { Matcher matcher = pattern.matcher(input); if (matcher.find()) { return matcher.group(1); } } return null; } private static InputStream getInputStream(HttpURLConnection conn) throws Exception { String ContentEncoding = 
conn.getHeaderField("Content-Encoding"); if (ContentEncoding != null) { ContentEncoding = ContentEncoding.toLowerCase(); if (ContentEncoding.indexOf("gzip") != 1) return new GZIPInputStream(conn.getInputStream()); else if (ContentEncoding.indexOf("deflate") != 1) return new DeflaterInputStream(conn.getInputStream()); } return conn.getInputStream(); } static HttpURLConnection getConnection(String url, String method, String param, Map<String, String> header) throws Exception { HttpURLConnection conn = (HttpURLConnection) (new URL(url)).openConnection(); conn.setRequestMethod(method); // 設置通用的請求屬性 conn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"); conn.setRequestProperty("Connection", "keep-alive"); conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.117 Safari/537.36"); conn.setRequestProperty("Accept-Encoding", "gzip,deflate"); String ContentEncoding = null; if (header != null) { for (Entry<String, String> entry : header.entrySet()) { if (entry.getKey().equalsIgnoreCase("Content-Encoding")) ContentEncoding = entry.getValue(); conn.setRequestProperty(entry.getKey(), entry.getValue()); } } if (method == "POST") { conn.setDoOutput(true); conn.setDoInput(true); if (param != null && !param.equals("")) { OutputStream output = conn.getOutputStream(); if (ContentEncoding != null) { if (ContentEncoding.indexOf("gzip") > 0) { output=new GZIPOutputStream(output); } else if(ContentEncoding.indexOf("deflate") > 0) { output=new DeflaterOutputStream(output); } } output.write(param.getBytes()); } } // 創建實際的鏈接 conn.connect(); return conn; } }
using System;
using System.Collections;
using System.IO;
using System.Linq;
using System.Net;
using System.Net.Security;
using System.Security.Cryptography.X509Certificates;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using System.IO.Compression;
using System.Collections.Generic;
using System.Collections.Specialized;

namespace CSharp.Util.Net
{
    /// <summary>
    /// Minimal HTTP helper that downloads a page and decodes it with the charset
    /// announced by the server. The charset is looked up in the response header first,
    /// then in a <c>&lt;meta&gt;</c> tag of the body, and finally taken from
    /// <see cref="HttpConfig.CharacterSet"/>.
    /// </summary>
    public class HttpHelper
    {
        /// <summary>
        /// Finds the charset declared by the first <c>&lt;meta ... charset=...&gt;</c> tag.
        /// <c>[^&gt;]*?</c> keeps the match inside one tag, so a later "charset=" on the
        /// same line (for example inside a script) is not picked up.
        /// </summary>
        private static readonly Regex MetaCharsetRegex = new Regex(
            @"<meta\b[^>]*?charset\s*=\s*[""']?([\w-]+)",
            RegexOptions.IgnoreCase | RegexOptions.Compiled);

        /// <summary>
        /// Certificate validation callback used for https requests; accepts every certificate.
        /// </summary>
        private static bool RemoteCertificateValidate(object sender, X509Certificate certificate, X509Chain chain, SslPolicyErrors errors)
        {
            return true;
        }

        /// <summary>Sends a POST request with <paramref name="data"/> as the body.</summary>
        /// <param name="url">Absolute URL.</param>
        /// <param name="data">Request body; may be null or empty.</param>
        /// <returns>The decoded response body.</returns>
        public static string SendPost(string url, string data)
        {
            return Send(url, "POST", data, null);
        }

        /// <summary>Sends a GET request.</summary>
        /// <param name="url">Absolute URL.</param>
        /// <returns>The decoded response body.</returns>
        public static string SendGet(string url)
        {
            return Send(url, "GET", null, null);
        }

        /// <summary>
        /// Sends a request and returns the response body decoded with the detected charset.
        /// </summary>
        /// <param name="url">Absolute URL.</param>
        /// <param name="method">HTTP method, e.g. "GET" or "POST".</param>
        /// <param name="data">Request body, only written for POST; may be null.</param>
        /// <param name="config">Request settings; a default <see cref="HttpConfig"/> is used when null.</param>
        /// <returns>The decoded response body.</returns>
        public static string Send(string url, string method, string data, HttpConfig config)
        {
            if (config == null)
            {
                config = new HttpConfig();
            }

            using (HttpWebResponse response = GetResponse(url, method, data, config))
            using (Stream body = Decompress(response.GetResponseStream(), response.ContentEncoding))
            {
                byte[] bytes = ReadAllBytes(body);
                Encoding encoding = DetectEncoding(response.CharacterSet, bytes, config.CharacterSet);
                return encoding.GetString(bytes);
            }
        }

        /// <summary>
        /// Wraps <paramref name="raw"/> in the decompressor matching
        /// <paramref name="contentEncoding"/>; returns it unchanged when the encoding is
        /// empty or not gzip/deflate.
        /// </summary>
        private static Stream Decompress(Stream raw, string contentEncoding)
        {
            if (!string.IsNullOrEmpty(contentEncoding))
            {
                if (contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
                {
                    return new GZipStream(raw, CompressionMode.Decompress);
                }

                if (contentEncoding.IndexOf("deflate", StringComparison.OrdinalIgnoreCase) >= 0)
                {
                    // NOTE(review): servers may send zlib-wrapped data for "deflate";
                    // confirm DeflateStream accepts it on the target runtime.
                    return new DeflateStream(raw, CompressionMode.Decompress);
                }
            }

            return raw;
        }

        /// <summary>Reads <paramref name="source"/> to its end and returns the bytes read.</summary>
        private static byte[] ReadAllBytes(Stream source)
        {
            using (MemoryStream buffer = new MemoryStream())
            {
                source.CopyTo(buffer);
                return buffer.ToArray();
            }
        }

        /// <summary>
        /// Chooses the encoding for a response body: the header charset (unless it is empty,
        /// unknown, or ISO-8859-1), then the charset of a meta tag in the body, then
        /// <paramref name="defaultCharset"/>, and UTF-8 as the last resort.
        /// </summary>
        /// <param name="headerCharset">Charset reported by the response header; may be null.</param>
        /// <param name="body">Raw response bytes.</param>
        /// <param name="defaultCharset">Charset name to use when nothing is detected.</param>
        private static Encoding DetectEncoding(string headerCharset, byte[] body, string defaultCharset)
        {
            // The header value ISO-8859-1 is not trusted: it is often reported when the
            // server did not actually declare a charset.
            if (!string.IsNullOrEmpty(headerCharset)
                && !string.Equals(headerCharset.Trim(), "ISO-8859-1", StringComparison.OrdinalIgnoreCase))
            {
                Encoding fromHeader = TryGetEncoding(headerCharset);
                if (fromHeader != null)
                {
                    return fromHeader;
                }
            }

            // The meta declaration is plain ASCII, so an ASCII view of the bytes is enough
            // and does not depend on the machine's default code page.
            Match match = MetaCharsetRegex.Match(Encoding.ASCII.GetString(body));
            if (match.Success)
            {
                Encoding fromMeta = TryGetEncoding(match.Groups[1].Value);
                if (fromMeta != null)
                {
                    return fromMeta;
                }
            }

            return TryGetEncoding(defaultCharset) ?? Encoding.UTF8;
        }

        /// <summary>
        /// Resolves a charset name to an <see cref="Encoding"/>, or returns null when the
        /// name is empty or not known to the runtime.
        /// </summary>
        private static Encoding TryGetEncoding(string name)
        {
            if (string.IsNullOrEmpty(name))
            {
                return null;
            }

            // Strip whitespace and quotes in case the value arrives as charset="utf-8".
            string normalized = name.Trim().Trim('"', '\'');
            if (normalized.Length == 0)
            {
                return null;
            }

            if (string.Equals(normalized, "utf8", StringComparison.OrdinalIgnoreCase))
            {
                normalized = "utf-8";
            }

            try
            {
                return Encoding.GetEncoding(normalized);
            }
            catch (ArgumentException)
            {
                return null;
            }
            catch (NotSupportedException)
            {
                return null;
            }
        }

        /// <summary>
        /// Builds the request from <paramref name="config"/>, writes the POST body when
        /// present, and returns the response.
        /// </summary>
        private static HttpWebResponse GetResponse(string url, string method, string data, HttpConfig config)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Method = method;
            request.Referer = config.Referer;
            // Some pages return no content when no user agent is sent.
            request.UserAgent = config.UserAgent;
            request.Timeout = config.Timeout;
            request.Accept = config.Accept;
            request.Headers.Set("Accept-Encoding", config.AcceptEncoding);
            request.ContentType = config.ContentType;
            request.KeepAlive = config.KeepAlive;

            if (url.StartsWith("https", StringComparison.OrdinalIgnoreCase))
            {
                // Works around "Could not establish trust relationship for the SSL/TLS secure channel".
                // SECURITY: this sets a static callback that accepts every certificate, so
                // server certificates are not validated. Kept for backward compatibility.
                ServicePointManager.ServerCertificateValidationCallback = new RemoteCertificateValidationCallback(RemoteCertificateValidate);
            }

            if (string.Equals(method, "POST", StringComparison.OrdinalIgnoreCase))
            {
                if (!string.IsNullOrEmpty(data))
                {
                    byte[] payload = Encoding.UTF8.GetBytes(data);
                    if (config.GZipCompress)
                    {
                        // NOTE(review): no Content-Encoding request header is set for the
                        // compressed body; confirm the receiving server expects that.
                        using (MemoryStream compressed = new MemoryStream())
                        {
                            using (GZipStream gzip = new GZipStream(compressed, CompressionMode.Compress))
                            {
                                gzip.Write(payload, 0, payload.Length);
                            }
                            payload = compressed.ToArray();
                        }
                    }

                    request.ContentLength = payload.Length;
                    // Dispose the request stream so the body is complete before the response is read.
                    using (Stream requestStream = request.GetRequestStream())
                    {
                        requestStream.Write(payload, 0, payload.Length);
                    }
                }
                else
                {
                    request.ContentLength = 0;
                }
            }

            return (HttpWebResponse)request.GetResponse();
        }
    }

    /// <summary>
    /// Request settings used by <see cref="HttpHelper"/>.
    /// </summary>
    public class HttpConfig
    {
        /// <summary>Value of the Referer request header.</summary>
        public string Referer { get; set; }

        /// <summary>
        /// Value of the Content-Type request header. Defaults to "text/html; charset=utf-8".
        /// </summary>
        public string ContentType { get; set; }

        /// <summary>Value of the Accept request header.</summary>
        public string Accept { get; set; }

        /// <summary>Value of the Accept-Encoding request header. Defaults to "gzip,deflate".</summary>
        public string AcceptEncoding { get; set; }

        /// <summary>
        /// Request timeout in milliseconds. Defaults to 100000.
        /// </summary>
        public int Timeout { get; set; }

        /// <summary>Value of the User-Agent request header.</summary>
        public string UserAgent { get; set; }

        /// <summary>
        /// Whether the body of a POST request is gzip-compressed. Defaults to false.
        /// </summary>
        public bool GZipCompress { get; set; }

        /// <summary>Whether the request uses keep-alive. Defaults to true.</summary>
        public bool KeepAlive { get; set; }

        /// <summary>
        /// Charset used to decode the response when neither the response header nor the
        /// page declares one. Defaults to "UTF-8".
        /// </summary>
        public string CharacterSet { get; set; }

        /// <summary>Creates a configuration populated with the default values.</summary>
        public HttpConfig()
        {
            this.Timeout = 100000;
            this.ContentType = "text/html; charset=" + Encoding.UTF8.WebName;
            this.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.117 Safari/537.36";
            this.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
            this.AcceptEncoding = "gzip,deflate";
            this.GZipCompress = false;
            this.KeepAlive = true;
            this.CharacterSet = "UTF-8";
        }
    }
}