/**
 * Fetch a remote HTML page and return its title plus <body> markup as UTF-8.
 *
 * The page's stylesheet <link> tags are prepended to the body markup, and
 * root-relative stylesheet hrefs / image srcs are rewritten to absolute URLs
 * so the fragment still resolves its assets when embedded elsewhere.
 *
 * @param string $url Address to collect; "http://" is prepended when it has
 *                    no http:// or https:// prefix.
 *
 * @return array 'list' is always null; 'status' is 1 on success, 0 on failure;
 *               'info' is the collected HTML on success or an error message
 *               on failure; 'title' (success only) is the page title.
 */
function contentCollection($url)
{
    $data = array('list' => null, 'status' => 0);
    if (!$url) {
        $data['info'] = '請傳入採集地址';
        return $data;
    }

    // Require a real scheme prefix: a bare "^http" test would let hosts such
    // as "httpfoo.com" through without a scheme and leave the origin unknown.
    if (!preg_match('/^https?:\/\//i', $url)) {
        $url = 'http://' . $url;
    }
    $origin_pattern = '/^https?:\/\/[^\/?#]+/i';
    $host = preg_match($origin_pattern, $url, $host_ary) ? $host_ary[0] : '';

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    // Return the response as a string instead of printing it.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    // Force a new connection instead of a cached one.
    curl_setopt($ch, CURLOPT_FRESH_CONNECT, true);
    // Make curl_exec() fail on HTTP status >= 400 instead of returning the error page.
    curl_setopt($ch, CURLOPT_FAILONERROR, true);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B137 Safari/601.1');
    // Seconds to wait while connecting.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5);
    // The URL is caller-supplied: never let it (or a redirect) leave HTTP(S).
    curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);

    // Maximum number of redirects to follow.
    $redirects = 5;
    if (!ini_get('open_basedir') && !ini_get('safe_mode')) {
        curl_setopt($ch, CURLOPT_HEADER, false);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_MAXREDIRS, $redirects);
        curl_setopt($ch, CURLOPT_REDIR_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
        $content = curl_exec($ch);
    } else {
        // CURLOPT_FOLLOWLOCATION is unavailable under open_basedir/safe_mode,
        // so redirects are followed by hand from the response headers.
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
        curl_setopt($ch, CURLOPT_HEADER, true);
        curl_setopt($ch, CURLOPT_FORBID_REUSE, false);
        do {
            $content = curl_exec($ch);
            if ($content === false || curl_errno($ch)) {
                break;
            }

            // Split the header block off so it never leaks into the HTML.
            $header_size = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
            $headers = (string) substr($content, 0, $header_size);
            $content = (string) substr($content, $header_size);

            $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
            if (!in_array($code, array(301, 302, 303, 307, 308), true)) {
                break;
            }
            // Header names are case-insensitive and Location may be the first header line.
            if (!preg_match('/^(?:Location|URI):[ \t]*(.*?)[ \t]*\r?$/mi', $headers, $matches) || $matches[1] === '') {
                break;
            }

            $location = $matches[1];
            if (!preg_match('/^[a-z][a-z0-9+.\-]*:/i', $location)) {
                // Relative redirect target: resolve it against the URL just requested.
                $base = parse_url((string) curl_getinfo($ch, CURLINFO_EFFECTIVE_URL));
                if (!is_array($base) || !isset($base['scheme'], $base['host'])) {
                    break;
                }
                $base_origin = $base['scheme'] . '://' . $base['host']
                    . (isset($base['port']) ? ':' . $base['port'] : '');
                if (strpos($location, '//') === 0) {
                    $location = $base['scheme'] . ':' . $location;
                } elseif (strpos($location, '/') === 0) {
                    $location = $base_origin . $location;
                } else {
                    $base_path = isset($base['path']) ? $base['path'] : '/';
                    $location = $base_origin
                        . substr($base_path, 0, strrpos($base_path, '/') + 1)
                        . $location;
                }
            }
            curl_setopt($ch, CURLOPT_URL, $location);
        } while (--$redirects);

        if (!$redirects) {
            // Release the handle on this early exit as well.
            curl_close($ch);
            $data['info'] = '重定向次數太多。';
            return $data;
        }
    }

    $http_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $effective_url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
    // Close the cURL handle and free its resources.
    curl_close($ch);

    if (200 != $http_code) {
        $data['info'] = '採集失敗,http_code:' . $http_code;
        return $data;
    }

    // Root-relative asset URLs belong to the page actually served, which
    // after redirects may live on a different origin than the one requested.
    if (is_string($effective_url) && preg_match($origin_pattern, $effective_url, $effective_origin)) {
        $host = $effective_origin[0];
    }

    // Detect the encoding once (UTF-8 first: UTF-8 bytes frequently also form
    // valid GBK pairs, the reverse is rare) and convert the whole document so
    // the title and the body end up in the same encoding.
    $content = (string) $content;
    $source_encoding = mb_detect_encoding($content, array('UTF-8', 'GBK'), true);
    $content = mb_convert_encoding($content, 'UTF-8', $source_encoding ? $source_encoding : 'UTF-8');

    // Drop HTML comments; keep the document untouched if PCRE gives up.
    $without_comments = preg_replace('/<!--.*?-->/s', '', $content);
    if ($without_comments !== null) {
        $content = $without_comments;
    }

    if (!preg_match('/<title[^>]*?>(.*?)<\/title>(.*)<body[^>]*?>(.*?)<\/body>/is', $content, $body)) {
        $data['info'] = '沒有抓取到內容';
        return $data;
    }
    $title = $body[1];
    $collection_content = $body[3];

    // Prefix root-relative URLs ("/x") with the page origin. The lookahead
    // leaves protocol-relative URLs ("//cdn/x") alone, and the callback keeps
    // "$" or "\" in the origin from being read as replacement references.
    $absolutize = function ($pattern, $html) use ($host) {
        $result = preg_replace_callback($pattern, function ($m) use ($host) {
            return $m[1] . $host . '/';
        }, $html);
        return $result === null ? $html : $result;
    };

    $link_str = '';
    if (preg_match_all('/<link[^>]*rel=[\'"]stylesheet[\'"][^>]*>/is', $content, $link)) {
        $link_str = implode('', $absolutize('/(href=[\'"])\/(?!\/)/', $link[0]));
    }
    $collection_content = $absolutize('/(<img[^>]*?src=[\'"])\/(?!\/)/', $collection_content);

    $data['status'] = 1;
    $data['info'] = $link_str . $collection_content;
    $data['title'] = $title;
    return $data;
}