用 .NET 做的網站若是採用 POST 提交方式,且開啟了 ViewState,採集起來會有點繁瑣,在此跟大家分享一下做法。
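採集的難點是必須先取得表單裏面 __VIEWSTATE 和 __EVENTVALIDATION 兩個隱藏字段的值,並模擬 POST 提交給服務器,才能取到後面頁面的數據。由於回傳的數據比較大,不能用默認的 application/x-www-form-urlencoded 方式傳,要用上傳文件時所用的 multipart/form-data 表單模式(在 PHP 的 cURL 中,把 CURLOPT_POSTFIELDS 設爲數組即可)。主要代碼如下: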
/**
 * Author contact (QQ group): 223494678
 *
 * Send one POST request to the listing page, optionally simulating an
 * ASP.NET pager postback, and return the raw response.
 *
 * When $EVENTARGUMENT is empty the request carries no form fields (this is
 * how the first page is fetched). Otherwise the postback fields and the
 * search-form fields are sent. The fields are handed to cURL as an array, so
 * the body is encoded as multipart/form-data rather than urlencoded.
 *
 * @param string|int $EVENTARGUMENT   Value for __EVENTARGUMENT (getHtml passes the page number).
 * @param string     $VIEWSTATE       Value of the __VIEWSTATE hidden field from the previous response.
 * @param string     $EVENTVALIDATION Value of the __EVENTVALIDATION hidden field from the previous response.
 * @param string     $EVENTTARGET     Value for __EVENTTARGET; defaults to "pager".
 *
 * @return string|false Response headers followed by the body (CURLOPT_HEADER
 *                      is enabled), or false when curl_exec() fails.
 */
function getn($EVENTARGUMENT = "", $VIEWSTATE = "", $EVENTVALIDATION = "", $EVENTTARGET = "pager")
{
    $args = array();
    if ($EVENTARGUMENT) {
        $args = array(
            '__EVENTTARGET' => $EVENTTARGET,
            '__EVENTARGUMENT' => $EVENTARGUMENT,
            '__VIEWSTATE' => $VIEWSTATE,
            '__EVENTVALIDATION' => $EVENTVALIDATION,
            '__VIEWSTATEENCRYPTED' => '',
            // Fixed: the key used to be 'search$txtFundName=' (stray "="),
            // which posted a field name the form does not define.
            'search$txtFundName' => '',
            'search$txtFundManger' => '',
            'search$ddlFoundationDateOperater' => '1',
            'search$txtFoundationDate' => '',
            'search$dltFundType$ctl01$chkFundType' => 'on',
            'search$dltFundType$ctl01$chklFundChildType$0' => 'on',
            'search$dltFundType$ctl01$chklFundChildType$1' => 'on',
            'search$dltFundType$ctl01$chklFundChildType$2' => 'on',
            'search$dltFundType$ctl01$chklFundChildType$3' => 'on',
            'search$dltFundType$ctl01$chklFundChildType$4' => 'on',
            'search$chklFundStatus$0' => 'on',
            'search$ddlFundOrg' => '0',
            'search$txtFundOrgName' => '',
            'search$ddlStatisticDateOperater' => '1',
            'search$txtStatisticDate' => '',
            'search$radlStatisticMode' => '1',
        );
    }

    $user_agent = "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.79 Safari/535.11";

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, 'http://???/default.aspx');
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); // return the response instead of printing it
    curl_setopt($ch, CURLOPT_VERBOSE, TRUE);
    curl_setopt($ch, CURLOPT_AUTOREFERER, TRUE);
    curl_setopt($ch, CURLOPT_FAILONERROR, TRUE); // HTTP status >= 400 makes curl_exec() return false
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE);
    curl_setopt($ch, CURLOPT_HEADER, TRUE); // response headers are part of the returned string
    curl_setopt($ch, CURLINFO_HEADER_OUT, TRUE);

    curl_setopt($ch, CURLOPT_HTTPHEADER, array(
        'Accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language:zh-CN,zh;q=0.8',
        'Connection: Keep-Alive',
        'Cache-Control:max-age=0',
        'Referer:http://???/default.aspx',
        'Expect:', // empty value removes the "Expect: 100-continue" header cURL would otherwise add
    ));

    curl_setopt($ch, CURLOPT_POST, true); // send as POST
    curl_setopt($ch, CURLOPT_POSTFIELDS, $args); // array => multipart/form-data body
    curl_setopt($ch, CURLOPT_USERAGENT, $user_agent); // User-Agent request header

    $document = curl_exec($ch);
    return $document;
}

/**
 * Helper for getHtml(): read the value of a hidden <input> whose name and id
 * are both $name.
 *
 * @param string $document Response text to search.
 * @param string $name     Field name, e.g. "__VIEWSTATE".
 *
 * @return string|null The (non-empty) value attribute, or null when the field is not found.
 */
function getnExtractHiddenField($document, $name)
{
    $quoted = preg_quote($name, '/');
    $pattern = '/<input type="hidden" name="' . $quoted . '" id="' . $quoted . '" value="(.+?)"/is';

    return preg_match($pattern, $document, $found) === 1 ? $found[1] : null;
}

/**
 * Helper for getHtml(): cut out the text from '<table id="dltData"' up to
 * (not including) the first following '<!-- AspNetPager' marker.
 *
 * @param string $document Response text to search.
 *
 * @return string The extracted fragment, or '' when the table/marker pair is not found.
 */
function getnExtractDataTable($document)
{
    $pattern = '/(<table id="dltData".+?)<!-- AspNetPager/is';

    return preg_match($pattern, $document, $found) === 1 ? $found[1] : '';
}

/**
 * Author contact (QQ group): 223494678
 *
 * Collect the data table of every result page.
 *
 * Fetches the first page with getn(), reads the total page count and the
 * __VIEWSTATE / __EVENTVALIDATION hidden fields from it, then requests pages
 * 2..N one by one (pausing one second before each request), feeding each
 * response's hidden fields into the next request. Each page's table fragment
 * is appended to the global $html.
 *
 * If a request fails or the hidden fields cannot be found, a warning is
 * raised and the fragments gathered so far are returned instead of continuing
 * with empty postback state.
 *
 * @return string Concatenated table fragments (also left in the global $html).
 */
function getHtml()
{
    // The global accumulator is kept because callers may read $html directly.
    global $html;
    if ($html === null) {
        $html = '';
    }

    $first = getn();
    if (!is_string($first)) {
        trigger_error('getHtml: request for the first page failed', E_USER_WARNING);
        return $html;
    }

    // Total page count as shown by the pager; treat a missing pager as a single page.
    $total = 1;
    if (preg_match('/<font color="black"><b>(\d+?)<\/b><\/font> 頁<\/span>/is', $first, $matches) === 1) {
        $total = (int) $matches[1];
    }

    $html .= getnExtractDataTable($first);
    $VIEWSTATE = getnExtractHiddenField($first, '__VIEWSTATE');
    $EVENTVALIDATION = getnExtractHiddenField($first, '__EVENTVALIDATION');

    for ($i = 2; $i <= $total; $i++) {
        // Without the state from the previous response the postback cannot be simulated.
        if ($VIEWSTATE === null || $EVENTVALIDATION === null) {
            trigger_error(
                'getHtml: __VIEWSTATE/__EVENTVALIDATION not found; stopping before page ' . $i,
                E_USER_WARNING
            );
            break;
        }

        sleep(1); // throttle requests to the remote site

        $result = getn($i, $VIEWSTATE, $EVENTVALIDATION);
        if (!is_string($result)) {
            trigger_error('getHtml: request for page ' . $i . ' failed', E_USER_WARNING);
            break;
        }

        $html .= getnExtractDataTable($result);
        $VIEWSTATE = getnExtractHiddenField($result, '__VIEWSTATE');
        $EVENTVALIDATION = getnExtractHiddenField($result, '__EVENTVALIDATION');
    }

    return $html;
}
getHtml 是採集的入口函數,裏面先取第一頁的數據,並提取總頁數等信息,然後循環採集後面各頁的數據;getn 是採集函數,主要是用 cURL 模擬 POST 提交。