1、開源代碼的問題
在PHP爬蟲(2)中介紹了開源工程Sunra.PhpSimple.HtmlDomParser。在實際工作中發現一個問題,例如 http://www.163.com 的網頁數據怎麼也抓取不下來。
// Reproduce the problem: download the page, then hand the markup to the parser.
$url = 'http://www.163.com';
$content = Http::request($url);

// For this page str_get_html() returns false instead of a DOM object.
$dom = str_get_html($content);
檢查simple_html_dom.php代碼發現,
// Excerpt from simple_html_dom.php: when the input string is empty, or its byte
// length exceeds MAX_FILE_SIZE, the DOM object is cleared and false is returned
// instead of a parsed document.
if (empty($str) || strlen($str) > MAX_FILE_SIZE) { $dom->clear(); return false; }
這裡要判斷加載字符串的長度:字符串為空或長度超過MAX_FILE_SIZE時直接返回false。此處可以將MAX_FILE_SIZE修改得更大一些,或者去除這個判斷。
2、字符編碼
網頁抓取必然要處理網頁內容,網頁內容的編碼有很多種,常見的有UTF-8,GBK,GB2312等。一般處理的過程是:首先判斷字符編碼,再轉化成統一編碼。
判斷編碼的代碼,
/**
 * Detect which of a fixed list of encodings a byte string is valid in.
 *
 * Each candidate is checked by converting the string from that encoding to
 * itself: mbstring does not pass invalid byte sequences through unchanged, so
 * the round trip only reproduces the input when the bytes are valid for the
 * candidate. Candidates are tried in the order UTF-8, GBK, GB2312, GB18030 and
 * the first match wins.
 *
 * @param string $string Raw bytes to inspect.
 * @param mixed  $enc    Not used by the implementation; kept so existing
 *                       three-argument calls keep working.
 * @param mixed  $ret    When null, the matched encoding name is returned; any
 *                       other value makes the function return true on a match.
 *
 * @return string|bool The encoding name (or true, see $ret) on a match, false
 *                     when none of the candidates fits.
 */
function ws_mb_detect_encoding($string, $enc = null, $ret = null)
{
    static $enclist = array('UTF-8', 'GBK', 'GB2312', 'GB18030');

    // Normalise to a string so the strict comparison below treats scalar input
    // the same way the earlier hash-based comparison did.
    $string = (string) $string;

    foreach ($enclist as $item) {
        $sample = mb_convert_encoding($string, $item, $item);

        // Compare the bytes themselves, strictly. Comparing md5() digests with
        // "==" is unsafe: two different digests of the form "0e<digits>" are
        // numeric strings and compare equal under loose comparison.
        if ($sample === $string) {
            return $ret === null ? $item : true;
        }
    }

    return false;
}
轉化成UTF-8編碼
// $enc is the value returned by ws_mb_detect_encoding($html). It is false when
// no encoding matched, so only convert when a real encoding name came back and
// the content is not already UTF-8.
if (is_string($enc) && $enc !== 'UTF-8') {
    $html = mb_convert_encoding($html, "UTF-8", $enc);
}
下面的代碼,是從一個導航頁面抓取全部鏈接,找到鏈接文檔的title信息
<?php
/**
 * Crawl demo: collect the site roots linked from a navigation page, then
 * download each one, print its detected encoding and print its <title>.
 */
Vendor('Sunra.PhpSimple.HtmlDomParser');

$url = "http://hao.360.cn/";
$html = file_get_html($url);

// file_get_html() can hand back false instead of a DOM object; calling find()
// on it would be a fatal error, so fall back to an empty link list.
$links = array();
if ($html) {
    $links = $html->find('a');
} else {
    echo "沒法獲取網頁數據<br>";
}

// Step 1: gather distinct "scheme://host/" prefixes from absolute links.
$array = array();
foreach ($links as $l) {
    if (strpos($l->href, "http") !== 0) {
        continue; // skip relative links, anchors, javascript: links, ...
    }

    $url = $l->href;
    $pattern = "/(http|https):\/\/\S+?\//"; // scheme plus everything up to the first "/" after it
    $ret = preg_match($pattern, $url, $m);
    $url = $ret ? $m[0] : $url;

    // in_array() with strict comparison: array_search() returns the key 0 for
    // a hit on the first element, which "!" would misread as "not found".
    if (!in_array($url, $array, true)) {
        $array[] = $url;
    }
    if (count($array) > 30) {
        break;
    }
}

// Step 2: fetch every collected URL and report encoding and title.
foreach ($array as $url) {
    // Up to three download attempts per URL.
    $html = false;
    $num = 0;
    while ($html == false && $num < 3) {
        $num++;
        $html = \Home\Wsn\Http::request($url);
    }
    if ($html == false) {
        echo "沒法獲取網頁數據<br>";
        continue;
    }

    $enc = ws_mb_detect_encoding($html);
    echo $enc . "<br>";
    if ($enc === false) {
        echo "編碼錯誤<br>";
        continue;
    }
    if ($enc !== 'UTF-8') {
        $html = mb_convert_encoding($html, "UTF-8", $enc);
    }

    // str_get_html() returns false for empty or oversized input; treat that
    // the same as a page without a title instead of calling find() on false.
    $dom = str_get_html($html);
    $title = $dom ? $dom->find('title', 0) : null;
    if ($title) {
        echo "標題" . $title->innertext . "<br>";
    } else {
        echo "沒找到標題<br>";
    }
    echo "<hr>";

    // Release the parsed tree before moving on to the next page.
    if ($dom) {
        $dom->clear();
    }
}
?>
附錄
封裝好的HTTP類如下,喜歡的同學可以拿去直接使用。
<?php
/**
 * Send an HTTP request with cURL and return the response body.
 *
 * Connect and total timeouts are both 3 seconds. Response headers are not
 * included in the returned value.
 *
 * @param string       $url        Target URL.
 * @param array|string $params     GET: appended to the query string (arrays are
 *                                 run through http_build_query()). POST: sent as
 *                                 the request body. Ignored for DELETE.
 * @param string       $method     'GET', 'POST' or 'DELETE' (case-insensitive).
 *                                 Any other value sets no method-specific option.
 * @param array|false  $multi      For POST only: map of field name => file path
 *                                 to upload as multipart form data.
 * @param array        $extheaders Extra request header lines.
 *
 * @return string|bool The response body, or false when curl_exec() fails.
 */
public static function request($url, $params = array(), $method = 'GET', $multi = false, $extheaders = array())
{
    if (!function_exists('curl_init')) {
        exit('Need to open the curl extension');
    }

    $method = strtoupper($method);
    $ci = curl_init();
    curl_setopt($ci, CURLOPT_USERAGENT, 'PHP-SDK OAuth2.0');
    curl_setopt($ci, CURLOPT_CONNECTTIMEOUT, 3);
    curl_setopt($ci, CURLOPT_TIMEOUT, 3);
    curl_setopt($ci, CURLOPT_RETURNTRANSFER, true);
    // NOTE(review): TLS certificate and host verification are switched off, so
    // HTTPS responses are not authenticated. Confirm this is acceptable before
    // using the method for anything beyond fetching public pages.
    curl_setopt($ci, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ci, CURLOPT_SSL_VERIFYHOST, false);
    curl_setopt($ci, CURLOPT_HEADER, false);

    $headers = (array)$extheaders;
    switch ($method) {
        case 'POST':
            curl_setopt($ci, CURLOPT_POST, true);
            if (!empty($params)) {
                if ($multi) {
                    foreach ($multi as $key => $file) {
                        // The "@path" upload syntax was removed in PHP 7;
                        // CURLFile (PHP 5.5+) is its replacement. Fall back to
                        // the old form only where CURLFile does not exist.
                        $params[$key] = class_exists('CURLFile', false)
                            ? new \CURLFile($file)
                            : '@' . $file;
                    }
                    curl_setopt($ci, CURLOPT_POSTFIELDS, $params);
                    // An empty Expect header stops cURL from sending "Expect: 100-continue".
                    $headers[] = 'Expect: ';
                } else {
                    curl_setopt($ci, CURLOPT_POSTFIELDS, http_build_query($params));
                }
            }
            break;
        case 'DELETE':
            curl_setopt($ci, CURLOPT_CUSTOMREQUEST, 'DELETE');
            break;
        case 'GET':
            if (!empty($params)) {
                // Compare against false explicitly: strpos() returns 0 for a
                // match at the start of the string, which is falsy.
                $separator = strpos($url, '?') !== false ? '&' : '?';
                $url = $url . $separator . (is_array($params) ? http_build_query($params) : $params);
            }
            break;
    }

    curl_setopt($ci, CURLINFO_HEADER_OUT, true);
    curl_setopt($ci, CURLOPT_URL, $url);
    if ($headers) {
        curl_setopt($ci, CURLOPT_HTTPHEADER, $headers);
    }

    $response = curl_exec($ci);
    curl_close($ci);

    return $response;
}
?>