作業代碼
<?php
require 'vendor/autoload.php';
use Medoo\Medoo;
use QL\QueryList;
// Shared state: the functions in this script pull these variables in via `global`.

// Connection to the MySQL database that receives the scraped rows.
$database = new Medoo([
    'database_type' => 'mysql',
    'database_name' => 'demo_db',
    'server' => 'localhost',
    'username' => 'root',
    'password' => 'root',
    'charset' => 'utf8',
]);

// Number of job detail pages handled so far; used for the progress output.
$count = 0;

// One QueryList instance, reused for every HTML document that gets parsed.
$ql = new QueryList();
// Walk the search result pages in order: save every job linked from the
// current page, then stop once is_end() reports it as the last page.
$page = 1;
while ($page < 99999) {
    $url = "https://www.zhipin.com/c101180100/?query=php&page=" . $page;

    foreach (get_url_list($url) as $job) {
        get_and_save_data($job['url']);
    }

    if (is_end($url)) {
        break;
    }

    $page++;
}
/**
 * Tell whether a listing page is the last page of the search results.
 *
 * Downloads the page and inspects the class attribute of its
 * a[ka="page-next"] link.
 *
 * @param string $url Listing page URL.
 * @return bool True when that class attribute equals "next disabled".
 */
function is_end($url)
{
    global $ql;

    $document = $ql->html(get_html_source($url));
    $next_link = $document->find('a[ka="page-next"]');

    return $next_link->class == "next disabled";
}
/**
 * Collect the job detail links found on one listing page.
 *
 * Prints the listing URL, downloads the page and extracts the href of every
 * job title link, prefixed with the site origin.
 *
 * @param string $url Listing page URL.
 * @return array Rows of the form ['url' => <detail page URL>].
 */
function get_url_list($url)
{
    global $ql;

    echo $url . "\n";

    // The hrefs are prefixed with the site origin to form full URLs.
    $to_absolute = function ($href) {
        return "https://www.zhipin.com" . $href;
    };

    $rules = [
        "url" => [
            '#main > div > div.job-list > ul > li > div > div.info-primary > h3 > a',
            'href',
            '',
            $to_absolute,
        ],
    ];

    return $ql->html(get_html_source($url))->rules($rules)->queryData();
}
/**
 * Download one job detail page, extract its fields and store them in `jobs`.
 *
 * The "info" fragment is split on the <em class="dolt"></em> separator into
 * city, work_year and education. Segments that are absent are stored as null
 * instead of triggering undefined-offset notices.
 *
 * Side effects: writes to the `jobs` table, increments the global $count and
 * prints a progress line.
 *
 * @param string $url Job detail page URL.
 * @return void
 */
function get_and_save_data($url)
{
    global $ql, $database, $count;

    $html_source = get_html_source($url);
    $data = $ql->html($html_source)->rules([
        "company_name" => ['.job-sec>.name', 'text'],
        "job_name" => ['.info-primary>.name>h1', 'text'],
        "info" => ['.job-primary.detail-box>.info-primary>p', 'html'],
        "address" => ['.location-address', 'text'],
        "description" => ['.job-sec:nth-child(1)>.text', 'text'],
        "salary" => ['.salary', 'text'],
    ])->queryData();

    $separator = '<em class="dolt"></em>';

    foreach ($data as $key => $value) {
        // Fall back to the company page link when the primary selector gave
        // nothing; empty() also covers a row that lacks the key entirely.
        if (empty($value['company_name'])) {
            $value['company_name'] = $ql->html($html_source)->find('a[ka="job-detail-company_custompage"]')->text();
        }
        $value['url'] = $url;

        // Split once and pad to three entries so a short or missing info
        // block yields nulls rather than undefined-offset notices.
        $info = isset($value['info']) ? $value['info'] : '';
        $parts = array_pad(explode($separator, $info), 3, null);
        $value['city'] = $parts[0];
        $value['work_year'] = $parts[1];
        $value['education'] = $parts[2];

        unset($value['info']);
        $data[$key] = $value;
    }

    // Nothing was parsed from the page, so there is no row to store.
    if (!empty($data)) {
        $database->insert('jobs', $data);
    }

    $count++;
    // NOTE(review): dividing by 3 presumably assumes about 300 detail pages
    // in total — confirm, otherwise the percentage can exceed 100.
    $percent = round($count / 3);
    echo "當前進度: {$percent}%\n";
}
/**
 * Download a URL through the HTTP proxy, retrying until a body is received.
 *
 * A fresh cURL handle is created for every attempt. After a failed attempt
 * (curl_exec() returned false or an empty body) the handle is released and
 * the function waits two seconds before trying again.
 *
 * @param string $url          Address to download.
 * @param int    $max_attempts Upper bound on attempts; 0 (the default) keeps
 *                             retrying until a non-empty response arrives.
 * @return string|false Response body, or false once $max_attempts attempts
 *                      have all failed.
 */
function get_html_source($url, $max_attempts = 0)
{
    // NOTE(review): proxy credentials are hard-coded in source; consider
    // moving them to configuration outside version control.
    $proxyServer = "http://http-dyn.abuyun.com:9020";
    $proxyUser = "H19D75L76VK89Q8D";
    $proxyPass = "8C17B0A80F475BD8";

    // Built once: none of these settings change between attempts.
    $options = [
        CURLOPT_URL => $url,
        CURLOPT_HTTPPROXYTUNNEL => false,
        // NOTE(review): TLS peer verification is disabled, so the server
        // certificate is not checked.
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_PROXYTYPE => CURLPROXY_HTTP,
        CURLOPT_PROXY => $proxyServer,
        CURLOPT_PROXYAUTH => CURLAUTH_BASIC,
        CURLOPT_PROXYUSERPWD => "{$proxyUser}:{$proxyPass}",
        CURLOPT_USERAGENT => "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727;)",
        CURLOPT_CONNECTTIMEOUT => 3,
        CURLOPT_TIMEOUT => 5,
        CURLOPT_RETURNTRANSFER => true,
    ];

    $attempts = 0;
    while (true) {
        $ch = curl_init();
        curl_setopt_array($ch, $options);
        $result = curl_exec($ch);
        // Release the handle before any back-off so it is not held open
        // while sleeping.
        curl_close($ch);

        if ($result) {
            return $result;
        }

        $attempts++;
        if ($max_attempts > 0 && $attempts >= $max_attempts) {
            return false;
        }

        sleep(2);
    }
}
複製代碼
數據庫中的數據