最近抓取兩個網站內容的 PHP 代碼
列表頁抓取:第一個網站使用 phpQuery 插件,可以快速獲取;第二個網站提供的是 API,所以直接請求接口獲取數據。
load_third("phpQuery.php");

// List-page spider: collects article stubs (title, URL, cover image, source,
// publish time) from two sites and inserts them through News::add().
// Inserts that do not report an insert_id are appended to this log.
$failLog = "/data/log/fail_spider.log";

/*********www.sosobtc.com***********/
// Load the news list page; the pq() calls below query this document.
$listDoc = phpQuery::newDocumentFile('https://www.sosobtc.com/news/all');
$data = array();

// Article links. Every selector below is walked independently and the rows
// are stitched together by position, which assumes each list entry carries
// all four pieces -- NOTE(review): confirm against the live markup.
foreach (pq('.news-list .news-thumbnail a') as $key => $value) {
    $href = $value->getAttribute('href');
    $data[$key]['source_url'] = "https://www.sosobtc.com" . $href;
}

// Titles.
foreach (pq('.news-list .news-title h3') as $key => $value) {
    $data[$key]['title'] = pq($value)->text();
}

// Cover image URL, taken from the first share link of each entry.
foreach (pq('.news-list .share-box ul') as $key => $value) {
    $shareHref = pq($value)->find('li')->eq(0)->find('a')->attr('href');

    // The image URL is the value of the last query parameter of the share link.
    $lastParam = strrchr((string) $shareHref, "&");
    if ($lastParam === false) {
        continue;
    }
    // Limit of 2 keeps any "=" inside the image URL intact.
    $pair = explode("=", $lastParam, 2);
    if (!isset($pair[1])) {
        continue;
    }
    $data[$key]['pic'] = $pair[1];

    // presumably the 6th "/"-separated segment of the image URL is a date
    // folder that strtotime() can parse -- TODO confirm
    $segments = explode("/", $pair[1]);
    $data[$key]['add_time'] = isset($segments[5]) ? strtotime($segments[5]) : false;
}

// Original source of each article.
// NOTE(review): this selector is not scoped to .news-list like the others.
foreach (pq('.category') as $key => $value) {
    $data[$key]['source'] = pq($value)->text();
}

foreach ($data as $v) {
    // A row without a target URL cannot be fetched later; skip it.
    if (empty($v['source_url'])) {
        continue;
    }

    $adddata = array();
    $adddata['title'] = isset($v['title']) ? $v['title'] : '';
    $adddata['source_url'] = $v['source_url'];
    // Use the date parsed from the image path; fall back to "now" when it is
    // missing or unparseable (strtotime() returned false).
    $adddata['add_time'] = !empty($v['add_time']) ? $v['add_time'] : time();
    $adddata['pic'] = isset($v['pic']) ? $v['pic'] : '';
    $adddata['source'] = isset($v['source']) ? $v['source'] : '';

    $insert = News::add($adddata);
    if (empty($insert['insert_id'])) {
        // json_encode() returns a string; var_dump() would print to stdout
        // and contribute nothing to the log line.
        file_put_contents(
            $failLog,
            json_encode($insert) . "," . $adddata['source_url'] . "," . $adddata['pic'] . "\r\n",
            FILE_APPEND
        );
    }
}
/*********www.sosobtc.com***********/

/*********www.36kr.com/***********/
$response = file_get_contents("http://36kr.com/api/search-column/208?per_page=20&page=1");
if (!$response) {
    die;
}

$payload = json_decode($response, true);
// empty() also covers a failed decode or a missing data/items key.
if (empty($payload['data']['items']) || !is_array($payload['data']['items'])) {
    die;
}

foreach ($payload['data']['items'] as $item) {
    $sdata = array();
    $sdata['add_time'] = strtotime($item['published_at']);
    $sdata['title'] = $item['title'];
    $sdata['pic'] = isset($item['template_info']['template_cover'][0])
        ? $item['template_info']['template_cover'][0]
        : '';

    // user_info is decoded as a JSON string here; an already-decoded array
    // is accepted as well.
    $author = null;
    if (isset($item['user_info'])) {
        $author = is_string($item['user_info'])
            ? json_decode($item['user_info'], true)
            : $item['user_info'];
    }
    $sdata['source'] = isset($author['name']) ? $author['name'] : '';
    $sdata['source_url'] = "http://36kr.com/p/" . $item['id'] . ".html";

    $insert = News::add($sdata);
    if (empty($insert['insert_id'])) {
        // Log the values that were actually submitted; the API item itself
        // has no source_url / pic keys.
        file_put_contents(
            $failLog,
            json_encode($insert) . "," . $sdata['source_url'] . "," . $sdata['pic'] . "\r\n",
            FILE_APPEND
        );
    }
}
/*********www.36kr.com/***********/
先獲取列表內容,再根據列表中對應的目標地址,逐個抓取詳情。
詳情頁面抓取:
load_third("phpQuery.php");

/**
 * Fetch a remote file with cURL and write it to $dir . $filename.
 *
 * The target is overwritten, not appended to, so re-running the spider for
 * the same image cannot corrupt an existing file.
 *
 * @param string $url      Remote file URL.
 * @param string $dir      Destination directory with trailing slash; created if missing.
 * @param string $filename File name inside $dir.
 * @return bool True when the body was fetched and written, false otherwise.
 */
function spider_store_image($url, $dir, $filename)
{
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 30);
    $body = curl_exec($ch);
    curl_close($ch);

    if ($body === false || $body === '') {
        return false;
    }

    // Recursive mkdir; the trailing is_dir() tolerates another process
    // creating the directory at the same moment.
    if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
        return false;
    }

    return file_put_contents($dir . $filename, $body) !== false;
}

/**
 * Append one line to the detail-spider failure log.
 *
 * @param mixed  $id     Id of the news row being processed.
 * @param string $reason What failed (or the JSON-encoded result of the failed call).
 * @param array  $data   Extra context, stored as JSON.
 * @return void
 */
function spider_log_detail_failure($id, $reason, $data = array())
{
    file_put_contents(
        "/data/log/fail_spiderdown.log",
        $id . "|" . $reason . "|" . json_encode($data) . "\r\n",
        FILE_APPEND
    );
}

/**
 * Download an image into upimg/<dir>/, where <dir> is the 6th "/"-separated
 * segment of the URL (presumably a date folder on the remote host -- TODO
 * confirm). Falls back to today's date when that segment is missing or is a
 * dot-directory, so the file can never land outside upimg/.
 *
 * @param string $url Remote image URL.
 * @return string|false Path relative to upimg ("/<dir>/<file>"), or false on failure.
 */
function download($url)
{
    if (empty($url)) {
        return false;
    }

    $filename = pathinfo($url, PATHINFO_BASENAME);
    if ($filename === '') {
        return false;
    }

    $segments = explode("/", $url);
    $subdir = isset($segments[5]) ? $segments[5] : '';
    if ($subdir === '' || $subdir === '.' || $subdir === '..') {
        $subdir = date("Ymd");
    }

    // The web server user needs write permission on upimg.
    $path = '/data/xxxxx.com/phone/wwwroot/upimg/' . $subdir . "/";
    if (!spider_store_image($url, $path, $filename)) {
        return false;
    }

    return "/" . $subdir . "/" . $filename;
}

/**
 * Download an image into upimg/<today>/ and store it with a ".jpg" suffix
 * appended to the URL's basename.
 *
 * @param string $url Remote image URL.
 * @return string|false Path relative to upimg ("/<Ymd>/<file>.jpg"), or false on failure.
 */
function download2($url)
{
    if (empty($url)) {
        return false;
    }

    $filename = pathinfo($url, PATHINFO_BASENAME) . ".jpg";
    // Computed once so the directory written to and the path returned agree
    // even if the run crosses midnight.
    $today = date("Ymd");

    // The web server user needs write permission on upimg.
    $path = '/data/xxxxx.com/phone/wwwroot/upimg/' . $today . "/";
    if (!spider_store_image($url, $path, $filename)) {
        return false;
    }

    return "/" . $today . "/" . $filename;
}

// Detail spider: for every row returned by News::getdown(), download the
// cover image, fetch the article body and store both via News::modify().
// Rows that fail at any step are logged and left untouched, so a later run
// can pick them up again.
$pending = News::getdown();
if (empty($pending)) {
    exit(2);
}

foreach ($pending as $v) {
    $data = array();

    // strpos() is compared against false explicitly: a match at offset 0 is
    // still a match.
    if (strpos($v['source_url'], 'sosobtc') !== false) {
        $path = download($v['pic']); // store the cover image locally
        if ($path === false) {
            spider_log_detail_failure($v['id'], 'image download failed', array('pic' => $v['pic']));
            continue;
        }

        // Load the article page; pq() queries this document.
        $articleDoc = phpQuery::newDocumentFile($v['source_url']);
        $content = trim((string) pq(".article-main")->html());
        if ($content === '') {
            spider_log_detail_failure($v['id'], 'article body not found', array('source_url' => $v['source_url']));
            continue;
        }

        $data['pic'] = $path;
        // NOTE(review): only this branch escapes the content with
        // addslashes(); the 36kr branch stores it raw. Whether News::modify()
        // escapes by itself is not visible here -- kept as in the original.
        $data['content'] = addslashes($content);
        $data['status'] = 1;
    } elseif (strpos($v['source_url'], '36kr') !== false) {
        $path = download2($v['pic']); // store the cover image locally
        if ($path === false) {
            spider_log_detail_failure($v['id'], 'image download failed', array('pic' => $v['pic']));
            continue;
        }

        // The article is embedded in the page as a JS object literal, so it
        // is cut out with a regex and decoded as JSON.
        $page = file_get_contents($v['source_url']);
        if ($page === false
            || !preg_match("/<script>var props=(.*),locationnal={/", $page, $match)
        ) {
            spider_log_detail_failure($v['id'], 'props payload not found', array('source_url' => $v['source_url']));
            continue;
        }

        $info = json_decode($match[1], true);
        if (!isset($info['detailArticle|post']['content'])) {
            spider_log_detail_failure($v['id'], 'article content missing in props', array('source_url' => $v['source_url']));
            continue;
        }

        $data['pic'] = $path;
        $data['content'] = $info['detailArticle|post']['content'];
        $data['status'] = 1;
    } else {
        // Unknown source: nothing to do for this row.
        continue;
    }

    // Single update per row (the original 36kr branch issued it twice).
    $saved = News::modify($v['id'], $data);
    if (!$saved) {
        spider_log_detail_failure($v['id'], json_encode($saved), $data);
    }
}
第一種還是用 phpQuery 抓取。第二種查看網頁源代碼後發現,內容是由 JS 數據懶加載的,所以我直接用 PHP 正則匹配出需要的數據。其中我把兩者的封面圖都下載到了本地;本地的 upimg 目錄注意要給寫入權限,否則創建日期目錄可能會失敗。還有一點,我對 source_url(也就是目標網址)這個 MySQL 字段建了唯一索引,這樣每天定時跑這兩個腳本,既可以抓到最新數據,又不會抓到重複數據。