寫在開頭
1. 因爲業務需要,需要進行半自動化的遊戲資訊採集。17173是國內對標的遊戲資訊相對豐富、更新也比較勤的網站,所以也成了採集目標之一。
2. QueryList是一款開源的漸進式PHP採集框架,上手容易(從入門到採集到數據用了大約半小時,不含後期數據清洗)。
使用框架:Lumen + QueryList
安裝QueryList Composer包:
composer require jaeger/querylist
複製代碼
routes路由添加:
$router->get('/gather','Headline\GatherController@get_content');
複製代碼
傳參:
pageCount(頁數)
type(類型)
handle(中轉,'17173')
如:/gather?pageCount=1&type=1&handle=17173
複製代碼
返回結果:
{
  "msg": "採集成功",
  "code": 200,
  "count": 20
}
複製代碼
Controller代碼:
<?php
/**
* Created by PhpStorm.
* User: kopa
* Date: 12/24/18
* Time: 11:17 AM
*/
namespace App\Http\Controllers\Headline;
use App\Service\Common\RewardService;
use App\Http\Controllers\Controller;
use App\Service\Headline\RulesService;
use Illuminate\Http\Request;
class GatherController extends Controller
{
    /**
     * Run one scraping pass for the requested source site.
     *
     * Example: GET /gather?pageCount=1&type=1&handle=17173
     *
     * Inputs read from the request:
     *  - pageCount: how many pages to collect; an empty value becomes 1
     *  - type:      collection mode handed to the service; an empty value becomes 1
     *  - handle:    which site to scrape, '17173' or '9you'
     *
     * Scraping anything other than the home page is known to be error-prone.
     *
     * @param Request $request
     * @return array|mixed the value returned by the selected service method, or
     *                     ['code' => 0, 'message' => '出錯了'] for an unknown handle
     *                     or when the service throws
     */
    public function get_content(Request $request)
    {
        // input() looks at the query string as well as the body, which matches
        // the GET route this action is registered on.
        $pageCount = $request->input('pageCount');
        $type = $request->input('type');
        $handle = $request->input('handle');

        $pageCount = empty($pageCount) ? 1 : $pageCount;
        // Passing an explicit null would bypass the "= 1" defaults declared on
        // the service methods, so apply the same default here.
        $type = empty($type) ? 1 : $type;

        $service = new RulesService();

        try {
            switch ($handle) {
                case '17173':
                    return $service->rules($pageCount, $type);
                case '9you':
                    return $service->nineGameRules($pageCount, $type);
                default:
                    return ['code' => 0, 'message' => '出錯了'];
            }
        } catch (\Exception $e) {
            // Record the cause and still answer the client instead of
            // returning an empty response.
            error_log('GatherController::get_content failed: ' . $e->getMessage());

            return ['code' => 0, 'message' => '出錯了'];
        }
    }
}
複製代碼
Service業務層代碼:
<?php
/**
* Created by PhpStorm.
* User: whoami
* Date: 18-12-25
* Time: 下午3:47
*/
namespace App\Service\Headline;
use App\Models\bgcc\TabHeadlineArticle;
use App\Models\bgcc\TabHeadlineArticleGather;
use Illuminate\Support\Facades\DB;
use QL\QueryList;
class RulesService
{
/**
 * Scrape news from 17173.
 *
 * Two modes:
 *  - $type == 2 and $pageCount > 1: walk pages 1..$pageCount of the JSON list
 *    endpoint, fetch every listed article and return the raw result of
 *    getGameContent() (['count' => int, 'data' => array]).
 *  - anything else: scrape the home page, pair each headline with the
 *    thumbnail that points at the same link, fetch those articles and hand
 *    them to cleanData(), whose result is returned.
 *
 * Both modes return ['data' => null] when no article could be collected.
 *
 * @param int|string $pageCount number of list pages to walk in multi-page mode
 * @param int|string $type      1 = home page only, 2 = the given number of pages
 * @return array
 */
public function rules($pageCount,$type=1)
{
    if ($type == 2 && $pageCount > 1) {
        $res = [];
        for ($page = 1; $page <= $pageCount; $page++) {
            $url = 'http://news.17173.com/data/content/list.json?pageSize=10&pageNo='.$page;
            $list = $this->rulesString('17173', $url, $type);
            if (!is_array($list) || empty($list['data']) || !is_array($list['data'])) {
                continue; // page could not be fetched or decoded
            }
            // One entry per listed article, appended so that pages do not
            // overwrite each other.
            foreach ($list['data'] as $article) {
                if (!is_array($article) || empty($article['pageUrl'])) {
                    continue;
                }
                $res[] = [
                    'title' => isset($article['title']) ? $article['title'] : '',
                    'img'   => empty($article['imgPath']) ? 'default.jpg' : ltrim($article['imgPath'], '/'),
                    'link'  => $article['pageUrl'],
                ];
            }
        }

        if (empty($res)) {
            return ['data' => null];
        }

        return $this->getGameContent('17173', $res);
    }

    $url = 'http://news.17173.com';
    $getRule = $this->rulesString('17173', $url);
    $titleData = isset($getRule['titleData']) ? $getRule['titleData'] : [];
    $imgData = isset($getRule['imgData']) ? $getRule['imgData'] : [];

    // Index the usable thumbnails by link once; entries without a link, or
    // whose link does not contain "com", are ignored.
    $imgByLink = [];
    foreach ($imgData as $img) {
        if (empty($img['link']) || strpos($img['link'], 'com') === false) {
            continue;
        }
        $imgByLink[$img['link']] = $img;
    }

    // Keep only the headlines that have a matching thumbnail.
    $res = [];
    foreach ($titleData as $key => $headline) {
        if (empty($headline['link']) || !isset($imgByLink[$headline['link']])) {
            continue;
        }
        $img = $imgByLink[$headline['link']];
        $res[$key] = [
            'title' => $headline['title'],
            'img'   => ltrim(isset($img['img']) ? (string) $img['img'] : '', '/'),
            'link'  => $img['link'],
        ];
    }

    // Bail out before issuing any article requests when nothing matched.
    if (empty($res)) {
        return ['data' => null];
    }

    $result = $this->getGameContent('17173', $res);

    return $this->cleanData($result['data'], $url, '17173', '17173');
}
/**
 * Scrape the first page of the 9game news list, fetch each article and pass
 * the articles to cleanData(), whose result is returned.
 *
 * Multi-page collection is not implemented for this source, so $page is
 * ignored and the downstream calls are always made in single-page mode,
 * whatever $type the caller supplies.
 *
 * @param int|string $page accepted for interface compatibility; not used
 * @param int|string $type accepted for interface compatibility; not used
 * @return mixed the value returned by cleanData()
 */
public function nineGameRules($page,$type=1)
{
    // getGameContent() expects a nested, per-page structure for any type
    // other than 1; the list built here is flat, so pin the mode to 1.
    $singlePage = 1;

    $url = 'http://www.9game.cn/news/0_1';
    $getRule = $this->rulesString('9you', $url, $singlePage);
    $content = $this->getGameContent('9you', $getRule, $singlePage);

    return $this->cleanData($content, $url, '九遊', '9you');
}
/**
 * Fetch a listing page and extract the article index from it.
 *
 * Return shape by source:
 *  - '17173', $type == 2: the decoded JSON body of $url as an array
 *    ([] when the body is not a JSON object/array)
 *  - '17173', otherwise:  ['titleData' => rows of title/link taken from
 *    ".text" elements, 'imgData' => rows of img/link taken from ".pic" elements]
 *  - '9you':              list of ['title', 'link', 'dates'] rows, with the
 *    link prefixed by the site root and dates converted with strtotime()
 *  - anything else:       []
 *
 * @param string     $name source identifier, '17173' or '9you'
 * @param string     $url  page to fetch
 * @param int|string $type only consulted for '17173' (2 = JSON list endpoint)
 * @return array
 */
public function rulesString($name,$url,$type=1)
{
    switch ($name) {
        case '17173':
            // A single download serves every extraction below.
            $page = QueryList::get($url);

            if ($type == 2) {
                $decoded = json_decode($page->getHtml(), true);

                return is_array($decoded) ? $decoded : [];
            }

            $titleData = $page->rules([
                'title' => ['h2', 'text'],
                'link'  => ['a', 'href'],
            ])->range('.text')->queryData();

            $imgData = $page->rules([
                'img'  => ['img', 'src'],
                'link' => ['a', 'href'],
            ])->range('.pic')->queryData();

            return ['titleData' => $titleData, 'imgData' => $imgData];

        case '9you':
            $stringUrl = 'http://www.9game.cn';
            $titleData = QueryList::get($url)->rules([
                'title' => ['h2', 'text'],
                'link'  => ['a', 'href'],
                'dates' => ['.time', 'text'],
            ])->range('.title')->queryData();

            $result = [];
            foreach ($titleData as $key => $val) {
                $result[$key] = [
                    'title' => $val['title'],
                    'link'  => $stringUrl.$val['link'],
                    // Strip every non-ASCII byte from the date text before parsing it.
                    'dates' => strtotime(preg_replace('/([\x80-\xff]*)/i', '', $val['dates'])),
                ];
            }

            return $result;

        default:
            return [];
    }
}
/**
 * Download the article behind every link in $data and extract its title,
 * date text and body HTML.
 *
 * Return shape by source:
 *  - '17173': ['count' => number of fetched entries, 'data' => results keyed
 *    like $data]
 *  - '9you', $type == 1: results keyed like $data, empty results removed
 *  - '9you', other $type: $data is treated as a list of pages, each a list
 *    of rows; results from all pages are appended to one list, empty
 *    results removed
 *  - anything else: []
 *
 * Rows without a link are skipped.
 *
 * @param string          $name source identifier, '17173' or '9you'
 * @param array           $data rows holding at least a 'link' entry
 * @param int|string|null $type only consulted for '9you'
 * @return array
 */
public function getGameContent($name,$data,$type = null)
{
    switch ($name) {
        case '17173':
            $rules = [
                "title"   => ["h1", "text"],
                "dates"   => [".gb-final-date", "text"],
                "content" => ['#mod_article', 'html'],
            ];
            $range = '.gb-final-pn-article';

            $result = [];
            foreach ((array) $data as $key => $val) {
                if (empty($val['link'])) {
                    continue;
                }
                $result[$key] = $this->fetchArticle($val['link'], $rules, $range);
            }

            return ['count' => count($result), 'data' => $result];

        case '9you':
            $rules = [
                'title'   => ['.text-title h1', 'text'],
                'content' => ['.text-con', 'html'],
                'dates'   => ['.summary', 'text'],
            ];
            $range = '.left-con';

            $result = [];
            if ($type == 1) {
                // Home page: one flat list of rows.
                foreach ((array) $data as $key => $val) {
                    if (empty($val['link'])) {
                        continue;
                    }
                    $result[$key] = $this->fetchArticle($val['link'], $rules, $range);
                }
            } else {
                // Several pages: append, so that rows sharing an index on
                // different pages do not replace each other.
                foreach ((array) $data as $pageRows) {
                    if (!is_array($pageRows)) {
                        continue;
                    }
                    foreach ($pageRows as $row) {
                        if (!is_array($row) || empty($row['link'])) {
                            continue;
                        }
                        $result[] = $this->fetchArticle($row['link'], $rules, $range);
                    }
                }
            }

            // Drop pages on which the selectors matched nothing; keys are kept.
            return array_filter($result);

        default:
            return [];
    }
}

/**
 * Fetch one page and run the given extraction rules inside $range.
 *
 * The JSON round trip converts the value returned by getData() into plain
 * nested PHP arrays.
 *
 * @param string $url
 * @param array  $rules
 * @param string $range
 * @return array|null
 */
private function fetchArticle($url, array $rules, $range)
{
    $extracted = QueryList::get($url)->rules($rules)->range($range)->query()->getData();

    return json_decode(json_encode($extracted), true);
}
/**
 * Normalise scraped articles into article-table rows, drop the ones that are
 * already stored and insert the rest.
 *
 * $data is a list of fetch results; each fetch result is itself a list of
 * extracted rows with 'title', 'content' and (for 9you) 'dates'. When one
 * fetch result holds several rows, the last one wins.
 *
 * @param array  $data   fetch results as produced by getGameContent()
 * @param string $url    stored as the article's origin
 * @param string $author stored as the article's author and mixed into the md5 key
 * @param string $name   source identifier; '9you' enables date parsing
 * @return array ['code' => 200, 'count' => rows inserted, 'msg' => ...] on
 *               success, ['msg' => ..., 'code' => 0] when nothing new was found
 */
public function cleanData($data,$url,$author,$name)
{
    $nothingNew = ['msg'=>'暫時沒有新的數據','code'=>0];

    $return = [];
    foreach ((array) $data as $key => $val) {
        if (!is_array($val)) {
            continue;
        }
        foreach ($val as $datas) {
            // A row without a title cannot be identified or de-duplicated.
            if (!is_array($datas) || !isset($datas['title']) || $datas['title'] === '') {
                continue;
            }

            // Use the page's own date for 9you when it parses; otherwise "now".
            $createTime = time();
            if ($name == '9you' && isset($datas['dates'])) {
                $parsed = strtotime(preg_replace('/([\x80-\xff]*)/i', '', $datas['dates']));
                if ($parsed !== false) {
                    $createTime = $parsed;
                }
            }

            $return[$key] = [
                'article_title'        => $datas['title'],
                'article_content'      => isset($datas['content']) ? $datas['content'] : null,
                'article_create_time'  => $createTime,
                'article_author'       => $author,
                'article_come'         => $url,
                'article_type'         => 3,
                'article_upload_video' => 0,
                'article_tags'         => 0,
                'status'               => 0,
                'article_cate_id'      => 0,
                'article_cover_image'  => !empty($datas['article_cover_image']) ? $datas['article_cover_image'] : 0,
                // Title + author identifies an article for duplicate detection.
                'md5'                  => md5($datas['title'].$author),
            ];
        }
    }

    if (empty($return)) {
        return $nothingNew;
    }

    // Rows whose md5 is not yet in the article table, or false when all are known.
    $checkData = $this->checkTitle($return);

    // The staging table has served its purpose; clear it before inserting so
    // a failed insert cannot leave stale rows behind for the next run.
    DB::table('tab_headline_article_gather')->truncate();

    if (!is_array($checkData) || empty($checkData)) {
        return $nothingNew;
    }

    TabHeadlineArticle::insert($checkData);

    // Report the number of rows actually inserted, not the number scraped.
    return ['code'=>200,'count'=>count($checkData),'msg'=>'採集成功'];
}
/**
 * Filter out articles that are already stored.
 *
 * Steps: write $data to the staging (gather) table, read the staged rows
 * with status 0 back, and keep those whose md5 is neither present among
 * article-table rows with status 0 nor repeated earlier in the same batch.
 * Emptying the staging table afterwards is left to the caller.
 *
 * NOTE(review): only article rows with status 0 are compared, so an article
 * whose status has changed would presumably be collected again - confirm
 * that this is intended.
 * NOTE(review): the returned rows carry every column of the staging table;
 * confirm that these match the article table the caller inserts into.
 *
 * @param array $data rows to stage, each holding an 'md5' entry
 * @return array|false the new rows (re-indexed from 0), or false when there
 *                     are none
 */
public function checkTitle($data)
{
    if (empty($data)) {
        return false;
    }

    // Collected rows go into the gather table first.
    TabHeadlineArticleGather::insert($data);

    $gather = json_decode(
        DB::table('tab_headline_article_gather')->where('status', '=', 0)->get()->toJson(),
        true
    );

    // Only the hash column is needed from the article table.
    $known = [];
    foreach (DB::table('tab_headline_article')->where('status', 0)->pluck('md5') as $hash) {
        if ($hash !== null && $hash !== '') {
            $known[(string) $hash] = true;
        }
    }

    // Array-key lookup compares the hashes as exact strings; a loose "=="
    // would treat two different all-digit "0e..." digests as equal numbers.
    $fresh = [];
    foreach ((array) $gather as $row) {
        $hash = isset($row['md5']) ? (string) $row['md5'] : '';
        if ($hash !== '' && isset($known[$hash])) {
            continue;
        }
        if ($hash !== '') {
            $known[$hash] = true; // also rejects repeats inside this batch
        }
        $fresh[] = $row;
    }

    // false signals that every collected row was a duplicate.
    return empty($fresh) ? false : $fresh;
}
}
複製代碼