php curl 內容採集

時間 2019-11-13
標籤：php、curl、內容採集　欄目：PHP　简体版
原文：原文鏈接
/**
 * Fetch a web page with cURL and extract its title, <body> markup and stylesheet links.
 *
 * Root-relative stylesheet hrefs and image srcs ("/path") are rewritten to absolute
 * URLs using the scheme and host of the page that was finally served (after any
 * redirects). Protocol-relative ("//host/path") and absolute URLs are left untouched.
 *
 * @param string $url Page address; "http://" is prepended when it lacks an http(s):// scheme.
 * @return array 'list' => null and 'status' => 0|1 are always present. On failure 'info'
 *               holds an error message. On success 'info' holds the stylesheet <link> tags
 *               followed by the inner HTML of <body>, and 'title' holds the <title> text,
 *               both converted to UTF-8.
 */
function contentCollection($url){
    $data = array('list' => null, 'status' => 0);
    if (!$url) {
        $data['info'] = '請傳入採集地址';
        return $data;
    }
    // Test for a complete scheme, not just the letters "http", so that hosts such as
    // "httpbin.org" or an upper-case "HTTP://" prefix are handled correctly.
    if (!preg_match('/^https?:\/\//i', $url)) {
        $url = 'http://' . $url;
    }

    $hostPattern = '/^https?:\/\/[^\/]+/i';
    $redirectCodes = array(301, 302, 303, 307, 308);

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); // return the response as a string instead of printing it
    curl_setopt($ch, CURLOPT_FRESH_CONNECT, true);  // force a new connection rather than a cached one
    curl_setopt($ch, CURLOPT_FAILONERROR, true);    // make curl_exec() fail on HTTP status >= 400
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B137 Safari/601.1');
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5);    // seconds to wait while connecting

    // Maximum number of redirects to follow.
    $redirects = 5;
    if (!ini_get('open_basedir') && !ini_get('safe_mode')) {
        curl_setopt($ch, CURLOPT_HEADER, false);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_MAXREDIRS, $redirects);
        $content = curl_exec($ch);
    } else {
        // The original guards CURLOPT_FOLLOWLOCATION behind this check, so redirects
        // are followed by hand from the response headers here.
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
        curl_setopt($ch, CURLOPT_HEADER, true);
        curl_setopt($ch, CURLOPT_FORBID_REUSE, false);
        $current = $url;
        do {
            $content = curl_exec($ch);
            if ($content === false || curl_errno($ch)) {
                break;
            }
            $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
            if (!in_array($code, $redirectCodes, true)) {
                break;
            }
            // Keep the status line in the slice so that the first header is also
            // preceded by CRLF; header names are matched case-insensitively.
            $headers = substr($content, 0, curl_getinfo($ch, CURLINFO_HEADER_SIZE));
            if (!preg_match('/\r\n(?:Location|URI):[ \t]*(.*?)[ \t]*\r\n/i', $headers, $matches)
                || $matches[1] === '') {
                break;
            }
            // Resolve protocol-relative and root-relative targets against the URL
            // that produced this redirect.
            $target = $matches[1];
            if (strpos($target, '//') === 0) {
                $scheme = preg_match('/^https?:/i', $current, $schemeMatch) ? $schemeMatch[0] : 'http:';
                $target = $scheme . $target;
            } elseif ($target[0] === '/' && preg_match($hostPattern, $current, $currentHost)) {
                $target = $currentHost[0] . $target;
            }
            $current = $target;
            curl_setopt($ch, CURLOPT_URL, $target);
        } while (--$redirects);
        if (!$redirects) {
            // Release the handle on this early exit as well.
            curl_close($ch);
            $data['info'] = '重定向次數太多。';
            return $data;
        }
        if (is_string($content)) {
            // CURLOPT_HEADER put the response headers in front of the body; drop them.
            $content = (string) substr($content, curl_getinfo($ch, CURLINFO_HEADER_SIZE));
        }
    }
    $http_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $effective_url = (string) curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
    // Close the cURL handle and release its resources.
    curl_close($ch);

    if (200 != $http_code) {
        $data['info'] = '採集失敗,http_code:' . $http_code;
        return $data;
    }

    // Relative asset URLs belong to the page actually served, which may live on a
    // different host than the one requested when redirects were followed.
    $host = '';
    if (preg_match($hostPattern, $effective_url, $host_ary) || preg_match($hostPattern, $url, $host_ary)) {
        $host = $host_ary[0];
    }

    // Strip HTML comments; keep the original text if PCRE fails (e.g. backtrack limit).
    $stripped = preg_replace('/<!--.*?-->/is', '', $content);
    if ($stripped !== null) {
        $content = $stripped;
    }

    if (!preg_match('/<title[^>]*?>(.*?)<\/title>(.*)<body[^>]*?>(.*?)<\/body>/is', $content, $body)) {
        $data['info'] = '沒有抓取到內容';
        return $data;
    }
    $title = $body[1];
    $collection_content = $body[3];

    // Escape "\" and "$" so the host can never be read as a back-reference.
    $replacement = '${1}' . addcslashes($host, '\\$') . '/';

    // Collect stylesheet links and make root-relative hrefs absolute. The (?!\/)
    // look-ahead leaves protocol-relative "//cdn..." URLs alone.
    $link_str = '';
    if (preg_match_all('/<link[^>]*rel=[\'"]stylesheet[\'"][^>]*>/is', $content, $link) && $link[0]) {
        $fixed_links = preg_replace('/(href=[\'"])\/(?!\/)/i', $replacement, $link[0]);
        $link_str = implode('', is_array($fixed_links) ? $fixed_links : $link[0]);
    }

    $fixed_content = preg_replace('/(<img[^>]*?src=[\'"])\/(?!\/)/i', $replacement, $collection_content);
    if ($fixed_content !== null) {
        $collection_content = $fixed_content;
    }

    // Detect the page encoding once, strictly, trying UTF-8 before GBK: UTF-8 byte
    // sequences are frequently also valid GBK, so listing GBK first can corrupt
    // UTF-8 pages. The title is converted with the same encoding as the body.
    $candidates = 'ASCII,UTF-8,GBK';
    $encoding = mb_detect_encoding($content, $candidates, true);
    if ($encoding === false) {
        $encoding = $candidates;
    }

    $data['status'] = 1;
    $data['info'] = mb_convert_encoding($link_str . $collection_content, 'UTF-8', $encoding);
    $data['title'] = mb_convert_encoding($title, 'UTF-8', $encoding);
    return $data;
}