HttpWebRequest 模擬瀏覽器訪問網站

最近抓網頁時報錯:
要麼返回 The remote server returned an error: (442)
要麼返回: 非法訪問,您的行爲已被WAF系統記錄!
想了想,應該是對方網站加了防抓取的機制(WAF),因此改了一下方法,在請求中加上 Request.Headers 之類的瀏覽器標頭就好了。
具體要加哪些標頭,可以先用 Fiddler 抓一下瀏覽器的請求包來查看,例如:
 
GET http://www.baidu.com/ HTTP/1.1
Host: www.baidu.com
Connection: keep-alive
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh;q=0.9
  

  

 /// <summary>
 /// Downloads the Baidu home page with an HTTP/1.1 GET request that carries
 /// browser-like headers (User-Agent, Accept, Accept-Language, Referer), so that
 /// servers which reject bare programmatic requests treat it like a browser visit.
 /// </summary>
 /// <returns>The response body decoded as gb2312 text.</returns>
 /// <exception cref="WebException">
 /// Thrown by <c>GetResponse</c> when the request times out or the server
 /// answers with an error status.
 /// </exception>
 public static string GetHtml()
 {
     const string url = "http://www.baidu.com";

     HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
     request.Method = "GET";
     request.ProtocolVersion = HttpVersion.Version11;
     request.KeepAlive = true;

     // Both timeouts are in milliseconds (300000 ms = 5 minutes).
     request.Timeout = 300000;
     request.ReadWriteTimeout = 300000;

     // Let the framework send Accept-Encoding and transparently decompress the
     // body; adding the Accept-Encoding header by hand would return raw gzip bytes.
     request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

     // Headers copied from a real browser request.
     request.Host = "www.baidu.com";
     request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8";
     request.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
     request.UserAgent = @"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36";
     request.Referer = url;

     // If-Modified-Since is deliberately not sent: stamping it with the current
     // time turns this into a conditional GET, and a server that honours it
     // answers 304 Not Modified, which GetResponse() reports as a WebException.

     // The using statements release the connection, stream and reader even when
     // reading the body throws.
     using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
     using (Stream body = response.GetResponseStream())
     using (StreamReader reader = new StreamReader(body, Encoding.GetEncoding("gb2312")))
     {
         // The decoding charset is fixed to gb2312; adjust it to match the
         // charset declared by the target site's HTML.
         return reader.ReadToEnd();
     }
 }

 

再加一段 PHP 的代碼:在不修改本頁面 UTF-8 編碼的情況下,如何讓抓取的 gb2312 頁面不亂碼。

// Request headers sent with the fetch below. Every entry must be a complete
// "Name: value" line, because cURL forwards each string verbatim.
$headers = array();
$headers[] = 'X-Apple-Tz: 0';
$headers[] = 'X-Apple-Store-Front: 143444,12';
$headers[] = 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
$headers[] = 'Accept-Encoding: gzip, deflate';
$headers[] = 'Accept-Language: en-US,en;q=0.5';
$headers[] = 'Cache-Control: no-cache';
$headers[] = 'Content-Type: application/x-www-form-urlencoded; charset=gb2312'; // alternative: utf-8
// The field name was missing here; a bare browser string is not a valid header line.
$headers[] = 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36';

// NOTE(review): $url and $post_fields are not defined in the visible snippet;
// they must be set before this call.
$dat = cUrlGetData($url, $post_fields, $headers);
/**
 * Fetches a URL with cURL and returns the response body.
 *
 * Sends a POST when $post_fields is non-empty, otherwise a GET. On a transfer
 * error the cURL error message is echoed and the result of curl_exec() is
 * still returned.
 *
 * @param string            $url         Address to request.
 * @param array|string|null $post_fields POST payload; empty or null means GET.
 * @param array|null        $headers     Extra request headers, each a full "Name: value" line.
 * @return string|bool Response body, or false when the transfer failed.
 */
function cUrlGetData($url, $post_fields = null, $headers = null) {
    $ch = curl_init();
    // NOTE(review): CURLOPT_CONNECTTIMEOUT is measured in seconds, so 50000 is
    // roughly 14 hours; confirm whether milliseconds were intended.
    $timeout = 50000;

    curl_setopt($ch, CURLOPT_URL, $url);
    if (!empty($post_fields)) {
        curl_setopt($ch, CURLOPT_POST, 1);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $post_fields);
    }
    if (!empty($headers)) {
        curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
    }
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // SECURITY: certificate and host-name verification are both switched off,
    // so HTTPS responses are not authenticated.
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    // Have cURL decode gzip/deflate response bodies itself.
    curl_setopt($ch, CURLOPT_ENCODING, 'gzip,deflate');

    $data = curl_exec($ch);
    if (curl_errno($ch)) {
        echo 'Error:' . curl_error($ch);
    }
    curl_close($ch);
    return $data;
}

// PHP script starts here.

/**
 * Submits a POST request to a remote URL and returns the response body.
 *
 * Follows redirects, forwards the calling browser's User-Agent, and reads and
 * writes cookies through the file named by $GLOBALS['cookie_file'].
 *
 * @param string       $url  Address to post to.
 * @param array|string $data POST payload.
 * @param string       $ref  Value sent as the Referer header.
 * @return string|bool Response body, or false when the transfer failed.
 */
function ppost($url, $data, $ref) {
    $curl = curl_init(); // start a cURL session
    curl_setopt($curl, CURLOPT_URL, $url); // address to request
    // SECURITY: the peer certificate is not verified.
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, 0);
    // Check that the certificate's name matches the host. The former value 1 is
    // no longer accepted by libcurl; 2 is the supported setting.
    curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, 2);
    // Pass through the caller's browser identity; it is absent when run from the CLI.
    $userAgent = isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : '';
    curl_setopt($curl, CURLOPT_USERAGENT, $userAgent);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, 1); // follow redirects automatically
    curl_setopt($curl, CURLOPT_REFERER, $ref);
    curl_setopt($curl, CURLOPT_POST, 1); // send a regular POST request
    curl_setopt($curl, CURLOPT_POSTFIELDS, $data); // POST payload
    curl_setopt($curl, CURLOPT_COOKIEFILE, $GLOBALS['cookie_file']); // read previously stored cookies
    curl_setopt($curl, CURLOPT_COOKIEJAR, $GLOBALS['cookie_file']); // file in which cookies are saved
    curl_setopt($curl, CURLOPT_HTTPHEADER, array('Accept-Encoding: gzip, deflate'));
    // Have cURL decode gzip/deflate response bodies itself.
    curl_setopt($curl, CURLOPT_ENCODING, 'gzip,deflate');
    curl_setopt($curl, CURLOPT_TIMEOUT, 30); // overall time limit in seconds so a stalled transfer cannot hang
    curl_setopt($curl, CURLOPT_HEADER, 0); // 0 = do not include response headers in the output
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1); // return the response as a string instead of printing it

    $tmpInfo = curl_exec($curl); // perform the request
    if (curl_errno($curl)) {
        echo 'Errno' . curl_error($curl);
    }
    curl_close($curl); // close the cURL session
    return $tmpInfo; // response body
}
相關文章
相關標籤/搜索