超簡易PHP爬蟲

利用cURL和DOMDocument,經過XPath篩選數據,實現的簡易PHP爬蟲

<?php
// Minimal crawler: fetch a single page with cURL, parse it with DOMDocument,
// print the text of the <a> elements selected by an XPath query, then dump
// the raw response body.
header('Content-type: text/plain; charset=utf-8');

$target_url = "http://www.baidu.com";
$ch = curl_init();

curl_setopt($ch, CURLOPT_URL, $target_url);
// Treat HTTP status codes >= 400 as a failed transfer.
curl_setopt($ch, CURLOPT_FAILONERROR, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
// Set the Referer header automatically when following redirects.
curl_setopt($ch, CURLOPT_AUTOREFERER, true);
// Return the body from curl_exec() as a string instead of printing it.
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
// Give up after 10 seconds.
curl_setopt($ch, CURLOPT_TIMEOUT, 10);

$html = curl_exec($ch);

// curl_exec() returns false only on failure. Compare strictly so that an
// empty (or "0") body is not misreported as a cURL error.
if ($html === false) {
    // The response is declared text/plain, so separate lines with newlines
    // rather than HTML <br /> tags.
    echo "cURL error number:" . curl_errno($ch) . PHP_EOL;
    echo "cURL error:" . curl_error($ch) . PHP_EOL;
    exit;
}

// DOMDocument::loadHTML() rejects an empty string, so stop early when the
// server answered successfully but sent no body.
if ($html === '') {
    echo "Empty response body from " . $target_url . PHP_EOL;
    exit;
}

// Create a DOMDocument to hold the parsed HTML.
$dom = new DOMDocument();
// Real-world HTML is rarely well-formed: collect libxml parse warnings
// internally instead of silencing every error with the @ operator, then
// discard them and restore the previous error-handling mode.
$previous_libxml_setting = libxml_use_internal_errors(true);
$dom->loadHTML($html);
libxml_clear_errors();
libxml_use_internal_errors($previous_libxml_setting);
// Merge adjacent text nodes and drop empty ones.
$dom->normalize();

// Wrap the document in a DOMXPath so it can be queried.
$xpath = new DOMXPath($dom);
// Select every <a> that is a direct child of the element with id="u1".
$links = $xpath->query('//*[@id="u1"]/a');

// Print the text content of each matched link, one per line.
foreach ($links as $link) {
    echo $link->nodeValue . PHP_EOL;
}

// The raw response body is dumped below; because the content type is
// text/plain, the surrounding markup is emitted literally.
?>

<hr>
<pre>
<?= $html ?>
</pre>
相關文章
相關標籤/搜索