PHP爬蟲 -- 020 作業解析 爬取BOSS直聘的招聘信息

作業代碼

<?php

require 'vendor/autoload.php';
use Medoo\Medoo;
use QL\QueryList;

// Shared QueryList instance; the helper functions below access it via `global`.
$ql = new QueryList();
// Number of detail pages processed so far; used to estimate progress.
$count = 0;
// Open the database connection (shared with get_and_save_data via `global`).
$database = new Medoo([
    'database_type' => 'mysql',
    'database_name' => 'demo_db',
    'server' => 'localhost',
    'username' => 'root',
    'password' => 'root',
    'charset' => 'utf8',
]);
// Walk the paginated search results. The upper bound is only a safety cap:
// the loop is expected to end as soon as a page reports no next page.
for ($i = 1; $i < 99999; $i++) {
    $url = "https://www.zhipin.com/c101180100/?query=php&page=".$i;
    $url_list = get_url_list($url);
    // A page that yields no job links gives us nothing to crawl; stop here
    // instead of requesting every remaining page number up to the cap.
    if (count($url_list) === 0) {
        break;
    }
    foreach ($url_list as $item) {
        get_and_save_data($item['url']);
    }
    // Stop once the current page has no next page.
    if (is_end($url)) {
        break;
    }
}
/**
 * Determine whether the given listing page is the last page of results.
 *
 * Downloads the page and reads the class attribute of the "next page" link
 * (selector a[ka="page-next"]).
 *
 * @param string $url Listing page URL.
 * @return bool true when there is no further page to crawl, false otherwise.
 */
function is_end($url){
    global $ql;
    $class_content = $ql->html(get_html_source($url))->find('a[ka="page-next"]')->class;
    // No usable next-page link (element missing or without a class): there is
    // nothing to follow, so report the end instead of letting the caller keep
    // paging.
    if (!is_string($class_content) || trim($class_content) === '') {
        return true;
    }
    // Match the "disabled" class token on its own so that class order or extra
    // whitespace in the attribute does not defeat the check.
    $class_names = preg_split('/\s+/', trim($class_content));
    return in_array('disabled', $class_names, true);
}
/**
 * Collect the detail-page links of every job shown on one listing page.
 *
 * @param string $url Listing (pagination) page URL; it is echoed before the fetch.
 * @return array Rows of the form ['url' => absolute detail-page URL].
 */
function get_url_list($url){
    global $ql;
    echo $url."\n";

    // The hrefs on the page are site-relative; prefix them with the host.
    $to_absolute = function($content){
        return "https://www.zhipin.com".$content;
    };
    $rules = [
        "url" => [
            '#main > div > div.job-list > ul > li > div > div.info-primary > h3 > a',
            'href',
            '',
            $to_absolute,
        ],
    ];

    $page = $ql->html(get_html_source($url));
    return $page->rules($rules)->queryData();
}
/**
 * Scrape one job detail page and store the extracted rows in the `jobs` table.
 *
 * @param string $url Detail page URL.
 * @return void Nothing is returned; an estimated progress percentage is printed.
 */
function get_and_save_data($url){
    global $ql,$database,$count;

    $html_source = get_html_source($url);
    $data = $ql->html($html_source)->rules([
        "company_name"=>['.job-sec>.name','text'],
        "job_name"=>['.info-primary>.name>h1','text'],
        "info"=>['.job-primary.detail-box>.info-primary>p','html'],
        "address"=>['.location-address','text'],
        "description"=>['.job-sec:nth-child(1)>.text','text'],
        "salary"=>['.salary','text']
    ])->queryData();
    foreach ($data as $key => $value) {
        // If the company name was not found (missing or empty), read it from
        // the alternative location on the page.
        if (empty($value['company_name'])) {
            $value['company_name'] = $ql->html($html_source)->find('a[ka="job-detail-company_custompage"]')->text();
        }
        $value['url'] = $url;
        // "info" holds city / work experience / education separated by an
        // <em class="dolt"></em> marker. Split once and pad to three parts so
        // a shorter or missing value yields null instead of an undefined offset.
        $info = isset($value['info']) ? $value['info'] : '';
        $info_parts = array_pad(explode('<em class="dolt"></em>', $info), 3, null);
        $value['city'] = $info_parts[0];
        $value['work_year'] = $info_parts[1];
        $value['education'] = $info_parts[2];
        unset($value['info']); // drop "info" so the row can be written to the database
        $data[$key] = $value;
    }

    // Skip the insert when nothing was extracted from the page.
    if (!empty($data)) {
        $database->insert('jobs',$data);
    }
    $count++;
    // The divisor 3 treats 300 processed pages as 100%; cap the estimate so it
    // never reports more than 100%.
    $percent = min(100, round($count/3));
    echo "當前進度: {$percent}%\n";
}

/**
 * Download a page through the authenticated HTTP proxy.
 *
 * The request is repeated, pausing 2 seconds after each failed attempt, until
 * cURL returns a truthy body. There is no retry limit.
 *
 * @param string $url Target URL.
 * @return string Body returned by cURL for the target URL.
 */
function get_html_source($url){
    // NOTE(review): proxy credentials are hard-coded here; consider moving
    // them to configuration.
    $proxy_server = "http://http-dyn.abuyun.com:9020";
    $proxy_user = "H19D75L76VK89Q8D";
    $proxy_pass = "8C17B0A80F475BD8";

    // Request options are the same for every attempt, so build them once.
    $options = [
        CURLOPT_URL => $url,
        CURLOPT_HTTPPROXYTUNNEL => false,
        // NOTE(review): TLS peer verification is switched off.
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_PROXYTYPE => CURLPROXY_HTTP,
        CURLOPT_PROXY => $proxy_server,
        CURLOPT_PROXYAUTH => CURLAUTH_BASIC,
        CURLOPT_PROXYUSERPWD => $proxy_user.":".$proxy_pass,
        CURLOPT_USERAGENT => "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727;)",
        CURLOPT_CONNECTTIMEOUT => 3,
        CURLOPT_TIMEOUT => 5,
        CURLOPT_RETURNTRANSFER => true,
    ];

    do {
        $handle = curl_init();
        curl_setopt_array($handle, $options);
        $body = curl_exec($handle);
        // Back off briefly before the next attempt when this one failed.
        if (!$body) {
            sleep(2);
        }
        curl_close($handle);
    } while (!$body);

    return $body;
}
複製代碼

數據庫中的數據

相關文章
相關標籤/搜索