/// <summary>
/// Fetches the Baidu home page with a browser-like GET request and returns the
/// response body decoded as GB2312 text.
/// </summary>
/// <returns>The full response body as a string.</returns>
/// <exception cref="WebException">
/// Propagated from <see cref="HttpWebRequest.GetResponse"/> when the request fails or times out.
/// </exception>
public static string GetHtml()
{
    string url = "http://www.baidu.com";

    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

    // Both timeouts are expressed in milliseconds (300000 ms = 5 minutes).
    request.Timeout = 300000;
    request.ReadWriteTimeout = 300000;

    request.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");

    // Let the framework negotiate and transparently decode gzip/deflate bodies,
    // so no manual Accept-Encoding header or GZipStream wrapper is needed.
    request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

    request.KeepAlive = true;
    request.ProtocolVersion = HttpVersion.Version11;
    request.Method = "GET";
    request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8";
    request.Host = "www.baidu.com";
    request.UserAgent = @"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36";
    request.Referer = url;

    // NOTE(review): sending "now" as If-Modified-Since invites a 304 Not Modified
    // reply from servers that honour the header; kept as in the original - confirm
    // it is intentional.
    request.IfModifiedSince = DateTime.UtcNow;

    // Dispose the response, stream and reader even when reading throws; the
    // original closed them only on the success path.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream responseStream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(responseStream, Encoding.GetEncoding("gb2312")))
    {
        // The charset is hard-coded; adjust it to match the target site's page encoding.
        return reader.ReadToEnd();
    }
}
再加一段 PHP 的代碼:在不修改本頁面 UTF-8 編碼的情況下,如何讓抓取的 GB2312 頁面不亂碼。
// Build the request headers and fetch the page through cUrlGetData().
$headers = array();
$headers[] = 'X-Apple-Tz: 0';
$headers[] = 'X-Apple-Store-Front: 143444,12';
$headers[] = 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
$headers[] = 'Accept-Encoding: gzip, deflate';
$headers[] = 'Accept-Language: en-US,en;q=0.5';
$headers[] = 'Cache-Control: no-cache';
// NOTE(review): a request Content-Type describes the request body only; it does
// not change the charset of the response. If the fetched page is GB2312 and this
// page is UTF-8, the result presumably still needs converting, e.g.
// mb_convert_encoding($dat, 'UTF-8', 'GB2312') - confirm against the caller.
$headers[] = 'Content-Type: application/x-www-form-urlencoded; charset=gb2312'; // alternative: utf-8
// Each entry must be a complete "Name: value" line; the original pushed the bare
// user-agent string without the "User-Agent:" name.
$headers[] = 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36';

// $url and $post_fields are not defined in this snippet - presumably set by
// earlier code; verify before use.
$dat = cUrlGetData($url, $post_fields, $headers);
/**
 * Fetch a URL with cURL and return the response body.
 *
 * Sends a POST when $post_fields is non-empty; otherwise cURL's default request
 * method is used. Compressed (gzip/deflate) responses are decoded by cURL.
 * On a cURL error the message is echoed and the curl_exec() result is still returned.
 *
 * @param string            $url         Address to request.
 * @param array|string|null $post_fields POST payload; skipped when empty.
 * @param array|null        $headers     Extra header lines ("Name: value"); skipped when empty.
 * @return string|bool Response body, or false when curl_exec() fails.
 */
function cUrlGetData($url, $post_fields = null, $headers = null)
{
    $ch = curl_init();
    // NOTE(review): CURLOPT_CONNECTTIMEOUT is measured in seconds, so 50000 is
    // roughly 13.9 hours. If milliseconds were intended, switch to
    // CURLOPT_CONNECTTIMEOUT_MS. No overall CURLOPT_TIMEOUT is set either.
    $timeout = 50000;

    curl_setopt($ch, CURLOPT_URL, $url);

    if (!empty($post_fields)) {
        curl_setopt($ch, CURLOPT_POST, 1);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $post_fields);
    }

    if (!empty($headers)) {
        curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
    }

    // Return the body from curl_exec() instead of printing it.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // NOTE(review): TLS host and peer verification are both disabled, which
    // permits man-in-the-middle attacks; enable them unless the target requires this.
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    // Advertise gzip/deflate and have cURL decode compressed responses.
    curl_setopt($ch, CURLOPT_ENCODING, 'gzip,deflate');

    $data = curl_exec($ch);
    if (curl_errno($ch)) {
        echo 'Error:' . curl_error($ch);
    }
    curl_close($ch);

    return $data;
}

/**
 * POST data to a remote URL with cURL and return the response body.
 *
 * Follows redirects, sends the given referer, forwards the current visitor's
 * user agent when available, and reads/writes cookies using the file named by
 * $GLOBALS['cookie_file']. On a cURL error the message is echoed and the
 * curl_exec() result is still returned.
 *
 * @param string       $url  Address to post to.
 * @param array|string $data POST payload.
 * @param string       $ref  Value for the Referer header.
 * @return string|bool Response body, or false when curl_exec() fails.
 */
function ppost($url, $data, $ref)
{
    $curl = curl_init(); // start a cURL session
    curl_setopt($curl, CURLOPT_URL, $url); // address to request
    // NOTE(review): peer certificate verification is disabled; enable it unless
    // the target requires otherwise.
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, 0);
    // The value 1 is no longer supported by libcurl (PHP raises a notice and
    // uses 2 instead), so request 2 explicitly: check the certificate host name.
    curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, 2);
    // Forward the visitor's browser user agent; it is absent under CLI, so guard
    // against an undefined-index notice.
    $userAgent = isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : '';
    curl_setopt($curl, CURLOPT_USERAGENT, $userAgent);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, 1); // follow redirects automatically
    curl_setopt($curl, CURLOPT_REFERER, $ref);
    curl_setopt($curl, CURLOPT_POST, 1); // send a regular POST request
    curl_setopt($curl, CURLOPT_POSTFIELDS, $data); // POST payload
    curl_setopt($curl, CURLOPT_COOKIEFILE, $GLOBALS['cookie_file']); // read previously stored cookies
    curl_setopt($curl, CURLOPT_COOKIEJAR, $GLOBALS['cookie_file']); // file the cookies are saved to
    curl_setopt($curl, CURLOPT_HTTPHEADER, array('Accept-Encoding: gzip, deflate'));
    // Have cURL decode gzip/deflate-compressed responses.
    curl_setopt($curl, CURLOPT_ENCODING, 'gzip,deflate');
    curl_setopt($curl, CURLOPT_TIMEOUT, 30); // overall timeout in seconds so a request cannot hang forever
    curl_setopt($curl, CURLOPT_HEADER, 0); // do not include response headers in the returned output
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1); // return the body instead of printing it

    $tmpInfo = curl_exec($curl); // perform the request
    if (curl_errno($curl)) {
        echo 'Errno' . curl_error($curl);
    }
    curl_close($curl); // close the cURL session

    return $tmpInfo; // response body
}