思路分析，爲後面的講解做準備
url的規律
我們需要爬取的數據
建立數據表
-- Storage for scraped job postings: one row per job-detail page.
--   * IF NOT EXISTS makes the script safe to run more than once.
--   * InnoDB (MySQL's default engine) provides transactional, crash-safe
--     writes, which MyISAM does not.
--   * utf8mb4 can store 4-byte characters (e.g. emoji) that may occur in
--     scraped text; MySQL's `utf8` is limited to 3 bytes per character.
--     The client connection must also use utf8mb4 for such characters to
--     round-trip.
CREATE TABLE IF NOT EXISTS `jobs` (
    `id`           int(11)      NOT NULL AUTO_INCREMENT,
    `company_name` varchar(255) NOT NULL,
    `job_name`     varchar(255) NOT NULL,
    `work_year`    varchar(255) NOT NULL,  -- experience requirement, stored as scraped text
    `education`    varchar(255) NOT NULL,  -- education requirement, stored as scraped text
    `address`      varchar(255) NOT NULL,
    `city`         varchar(255) NOT NULL,  -- city name the crawl was run for
    `description`  text         NOT NULL,
    `url`          varchar(255) NOT NULL,  -- detail-page URL the row was scraped from
    `salary`       varchar(10)  NOT NULL,  -- salary text as shown on the page
    PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
複製代碼
參考代碼
<?php
// Composer autoloader. __DIR__ plus a forward slash resolves on every OS and
// from any working directory; a backslash separator only works on Windows.
require __DIR__ . '/vendor/autoload.php';
use QL\QueryList;
use Medoo\Medoo;

// Shared database connection, used by save_data().
// NOTE(review): credentials are hard-coded for a local demo database; move
// them out of source control before using this anywhere else.
$database = new Medoo([
    'database_type' => 'mysql',
    'database_name' => 'demo_db',
    'server' => 'localhost',
    'username' => 'root',
    'password' => 'root',
    // utf8mb4 lets 4-byte characters in scraped text reach the server intact;
    // MySQL's `utf8` only covers 3-byte characters.
    'charset' => 'utf8mb4',
]);

// Search keyword placed in the listing URL's `query` parameter.
$keyword = 'php';

// Running count of detail pages processed; drives the progress output.
$count = 1;

// City display name => city code used as the listing URL path segment.
$city_arr = [
    "鄭州"=>'c101180100'
];

// City currently being crawled; set by start() and stored with each row.
$current_city = "";

// Shared QueryList instance used for all HTML parsing.
$ql = new QueryList();
/**
 * Crawl every configured city: build the listing URL for the search keyword,
 * collect the detail-page URLs, then scrape and store each detail page.
 *
 * Reads the globals $city_arr and $keyword, and writes $current_city so that
 * get_and_save_data() can tag each stored row with the city being crawled.
 *
 * @return void
 */
function start(){
    global $city_arr, $keyword, $current_city;

    foreach ($city_arr as $city_name => $city_code) {
        $current_city = $city_name;

        // Encode the keyword so spaces or non-ASCII terms yield a valid URL.
        $url = "https://www.zhipin.com/{$city_code}/?query=" . urlencode($keyword);

        // Distinct loop variables: the inner loop no longer reuses the outer
        // loop's key/value names.
        foreach (get_detail_list($url) as $item) {
            if (empty($item['detail_url'])) {
                continue;
            }
            get_and_save_data($item['detail_url']);
        }
    }
}

start();
/**
 * Collect job-detail URLs from the paginated listing that starts at $url.
 *
 * Pages are requested as "$url&page=N" for N = 1, 2, ... until either
 * $max_pages pages have been read or the page's "next" link carries the
 * class "next disabled".
 *
 * @param string $url       Listing URL that already contains a query string.
 * @param int    $max_pages Maximum number of listing pages to read (default 3).
 * @return array List of ['detail_url' => absolute URL] rows.
 */
function get_detail_list($url, $max_pages = 3){
    global $ql;

    $data = [];
    for ($page = 1; $page <= $max_pages; $page++) {
        $current_url = $url."&page=".$page;

        // Download each listing page once and reuse the markup for both the
        // link extraction and the "next page" check.
        $html_source = get_html_source($current_url);

        $tmp = $ql->html($html_source)->rules([
            'detail_url'=>['#main > div > div.job-list > ul > li > div > div.info-primary > h3 > a','href','',function($a_href){
                // Listing hrefs are site-relative; make them absolute.
                return "https://www.zhipin.com".$a_href;
            }]
        ])->queryData();
        $data = array_merge($data, $tmp);

        $next = $ql->html($html_source)->find('a.next[ka="page-next"]')->attr('class');
        if ($next === "next disabled") {
            // Last page reached.
            break;
        }
    }
    return $data;
}
/**
 * Scrape one job-detail page and store it through save_data().
 *
 * Prints the URL and a progress line, increments the global $count, fetches
 * the page, extracts the fields of a `jobs` row and saves them. The row's
 * city comes from the global $current_city.
 *
 * @param string $url   Absolute URL of the job-detail page.
 * @param int    $total Expected number of detail pages, used only for the
 *                      progress percentage (default 90).
 * @return void
 */
function get_and_save_data($url, $total = 90){
    global $ql, $current_city, $count;

    // Reject an empty URL before any network work; a check placed after the
    // fetch can never fire, because the fetch keeps retrying a bad URL.
    if (!$url) {
        echo "skipping job detail: empty url\n";
        return;
    }

    echo $url."\n";
    $percent = $total > 0 ? ($count / $total) * 100 : 0;
    echo "如今是第{$count}頁, 總共{$total}頁, 當前進度{$percent}%.\n";
    $count++;

    // Parse the page once and run every selector against the same document.
    $doc = $ql->html(get_html_source($url));

    $company_name = $doc->find('div.job-sec > div.name')->text();
    if (!$company_name) {
        // Fall back to the company link when the first selector finds nothing.
        $company_name = $doc->find("a[ka='job-detail-company_custompage']")->text();
    }

    // The info paragraph separates its parts with <em class="dolt"></em>;
    // part 1 is used as work experience and part 2 as education.
    $info = (string) $doc->find('div.job-primary.detail-box>div.info-primary > p')->html();
    $info_parts = explode('<em class="dolt"></em>', $info);

    $data = [
        'company_name' => $company_name,
        'salary'       => $doc->find("div.name>span.salary")->text(),
        'job_name'     => $doc->find('div.info-primary>div.name>h1')->text(),
        // Default to '' so a missing part does not raise an undefined-offset
        // notice or put NULL into a NOT NULL column.
        'work_year'    => isset($info_parts[1]) ? $info_parts[1] : '',
        'education'    => isset($info_parts[2]) ? $info_parts[2] : '',
        'address'      => $doc->find('div.location-address')->text(),
        'city'         => $current_city,
        'description'  => $doc->find("div.detail-content>div:nth-child(1).job-sec>div.text")->text(),
        'url'          => $url,
    ];

    save_data($data);
}
/**
 * Insert one scraped job record into the `jobs` table.
 *
 * @param array $data Column name => value map for a single row.
 * @return void
 */
function save_data($data){
    // Use the shared connection held in the global $database variable.
    $connection = $GLOBALS['database'];
    $connection->insert('jobs', $data);
}
/**
 * Download $url through the HTTP proxy, retrying until a non-empty body is
 * returned.
 *
 * Each attempt echoes the URL. A failed attempt (cURL error or empty body)
 * is logged and followed by a short pause before the next try.
 *
 * @param string $url          URL to fetch.
 * @param int    $max_attempts Maximum number of attempts; 0 (default) retries
 *                             without limit.
 * @return string|false Response body, or false when $max_attempts is positive
 *                      and every attempt failed.
 */
function get_html_source($url, $max_attempts = 0) {
    // Loop-invariant configuration, built once instead of on every attempt.
    // NOTE(review): proxy credentials are committed in source. They can be
    // overridden through the environment; the literals are only fallbacks
    // and should be rotated and removed.
    $proxyServer = "http://http-dyn.abuyun.com:9020";
    $proxyUser = getenv('ABUYUN_PROXY_USER') ?: "H19D75L76VK89Q8D";
    $proxyPass = getenv('ABUYUN_PROXY_PASS') ?: "8C17B0A80F475BD8";

    $options = [
        CURLOPT_URL => $url,
        CURLOPT_HTTPPROXYTUNNEL => false,
        // NOTE(review): TLS peer verification is disabled; responses cannot
        // be authenticated. Confirm this is required by the proxy setup.
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_PROXYTYPE => CURLPROXY_HTTP,
        CURLOPT_PROXY => $proxyServer,
        CURLOPT_PROXYAUTH => CURLAUTH_BASIC,
        CURLOPT_PROXYUSERPWD => "{$proxyUser}:{$proxyPass}",
        CURLOPT_USERAGENT => "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727;)",
        CURLOPT_CONNECTTIMEOUT => 3,   // seconds
        CURLOPT_TIMEOUT => 5,          // seconds
        CURLOPT_RETURNTRANSFER => true,
    ];

    $attempt = 0;
    while (true) {
        $attempt++;
        echo $url."\n";

        $ch = curl_init();
        curl_setopt_array($ch, $options);
        $result = curl_exec($ch);
        $error = curl_error($ch);
        curl_close($ch);

        if ($result) {
            return $result;
        }

        // Report why the attempt failed rather than retrying silently.
        error_log("get_html_source: attempt {$attempt} failed for {$url}: "
            . ($error !== '' ? $error : 'empty response body'));

        if ($max_attempts > 0 && $attempt >= $max_attempts) {
            return false;
        }

        // Pause 0.3 s so a persistent failure does not become a tight
        // request loop.
        usleep(300000);
    }
}
複製代碼