分析網站
需要爬取的數據
先創建數據表, 用來保存電影數據
-- One row per movie scraped from the Maoyan board pages.
-- Every scraped field stays a string column: the scraper inserts the raw text
-- (or attribute value) it reads from the page without converting it.
-- IF NOT EXISTS makes the statement safe to re-run.
CREATE TABLE IF NOT EXISTS `movies` (
    `id`    int(11)      NOT NULL AUTO_INCREMENT,
    `title` varchar(255) NOT NULL,
    `rank`  varchar(255) NOT NULL,  -- keep the backticks: RANK is a reserved word in MySQL 8.0
    `score` varchar(255) NOT NULL,
    `actor` varchar(255) NOT NULL,
    `date`  varchar(255) NOT NULL,
    `img`   varchar(255) NOT NULL,
    PRIMARY KEY (`id`)
-- InnoDB gives transactional, crash-safe writes (MyISAM does not);
-- utf8mb4 stores the full Unicode range, whereas MySQL's `utf8` is limited to 3-byte characters.
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
複製代碼
使用IP代理池獲取數據, 並打印輸出
<?php
require 'vendor/autoload.php';
use QL\QueryList;
// Visit the ten board pages; the "offset" query parameter grows by 10 per page.
$base = "https://maoyan.com/board/4?offset=";
for ($i = 0; $i < 10; ++$i) {
    $page = 10 * $i;
    $url = "{$base}{$page}";
    // Show which page is being fetched before dumping its extracted data.
    echo $url, "\n";
    get_content($url);
}
/**
 * Download one board page and dump the fields extracted from it.
 *
 * @param string $url Page address; it is fetched with get_html_source().
 * @return void The extracted data is printed with var_dump(), nothing is returned.
 */
function get_content($url)
{
    $html = get_html_source($url);

    // One rule per output field: [CSS selector, what to read from the matched node].
    // 'title' reads the link's title attribute, 'img' reads data-src, the rest read text.
    $rules = [
        'title' => ['#app > div > div > div.main > dl > dd > div > div > div.movie-item-info > p.name > a', 'title'],
        'rank' => ['#app > div > div > div.main > dl > dd > i', 'text'],
        'img' => ['#app > div > div > div.main > dl > dd > a > img.board-img', 'data-src'],
        'actor' => ['#app > div > div > div.main > dl > dd > div > div > div.movie-item-info > p.star', 'text'],
        'date' => ['#app > div > div > div.main > dl > dd> div > div > div.movie-item-info > p.releasetime', 'text'],
        'score' => ['#app > div > div > div.main > dl > dd > div > div > div.movie-item-number.score-num > p', 'text'],
    ];

    $data = QueryList::html($html)->rules($rules)->queryData();
    var_dump($data);
}
/**
 * Download a page through the HTTP proxy at http-dyn.abuyun.com:9020.
 *
 * The proxy credentials can be supplied through the ABUYUN_PROXY_USER and
 * ABUYUN_PROXY_PASS environment variables; when those are unset the values
 * embedded below are used, so existing setups keep working.
 *
 * @param string $url Address of the page to download.
 * @return string|false The response body, or false when the transfer failed
 *                      (the cURL error is written to the PHP error log).
 */
function get_html_source($url)
{
    $proxyServer = "http://http-dyn.abuyun.com:9020";
    // NOTE(review): credentials committed to source are visible to every reader;
    // prefer the environment variables and rotate these fallback values.
    $proxyUser = getenv('ABUYUN_PROXY_USER') ?: "H19D75L76VK89Q8D";
    $proxyPass = getenv('ABUYUN_PROXY_PASS') ?: "8C17B0A80F475BD8";

    $ch = curl_init();
    curl_setopt_array($ch, [
        CURLOPT_URL => $url,
        CURLOPT_HTTPPROXYTUNNEL => false,
        // NOTE(review): this turns off TLS certificate verification for the target site.
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_PROXYTYPE => CURLPROXY_HTTP,
        CURLOPT_PROXY => $proxyServer,
        CURLOPT_PROXYAUTH => CURLAUTH_BASIC,
        CURLOPT_PROXYUSERPWD => "{$proxyUser}:{$proxyPass}",
        CURLOPT_USERAGENT => "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727;)",
        CURLOPT_CONNECTTIMEOUT => 3, // seconds allowed for connecting
        CURLOPT_TIMEOUT => 5,        // seconds allowed for the whole transfer
        CURLOPT_RETURNTRANSFER => true, // return the body instead of printing it
    ]);

    $result = curl_exec($ch);
    if ($result === false) {
        // Report why the transfer failed instead of failing silently.
        error_log("get_html_source: request for {$url} failed: " . curl_error($ch));
    }
    curl_close($ch);

    return $result;
}
複製代碼
把數據保存到數據庫
<?php
require 'vendor/autoload.php';
use QL\QueryList;
use Medoo\Medoo;
// Shared Medoo connection to the local MySQL database; the functions in this
// script reach it through `global $database`.
$database = new Medoo([
    'database_type' => 'mysql',
    'server' => 'localhost',
    'database_name' => 'demo_db',
    'username' => 'root',
    'password' => 'root',
    'charset' => 'utf8',
]);
/**
 * Download one board page and extract the movie fields from it.
 *
 * @param string $url Page address; it is fetched with get_html_source().
 * @return mixed Whatever QueryList's queryData() yields for the rules below.
 */
function get_data($url)
{
    echo "function get_data is running ... \n";

    $html = get_html_source($url);

    // One rule per output field: [CSS selector, what to read from the matched node].
    $rules = [
        "img" => ['#app > div > div > div.main > dl > dd > a > img.board-img', 'data-src'],
        "actor" => ['#app > div > div > div.main > dl > dd > div > div > div.movie-item-info > p.star', 'text'],
        "date" => ['#app > div > div > div.main > dl > dd > div > div > div.movie-item-info > p.releasetime', 'text'],
        "rank" => ['#app > div > div > div.main > dl > dd > i', 'text'],
        "score" => ['#app > div > div > div.main > dl > dd > div > div > div.movie-item-number.score-num > p', 'text'],
        "title" => ['#app > div > div > div.main > dl > dd > div > div > div.movie-item-info > p.name > a', 'text'],
    ];

    return QueryList::html($html)->rules($rules)->queryData();
}
/**
 * Insert the scraped movie data into the `movies` table.
 *
 * Uses the global $database connection. When $data is empty nothing is sent
 * to the database and a notice is printed instead.
 *
 * @param array $data Scraped data as returned by get_data().
 * @return void
 */
function save_movie($data)
{
    echo "function save_movie is running ... \n";
    global $database;

    // A page that yielded no movies has nothing to store.
    // NOTE(review): handing an empty array to insert() presumably produces a
    // blank or invalid INSERT, depending on the Medoo version; confirm.
    if (empty($data)) {
        echo "save_movie: nothing to insert, skipping\n";
        return;
    }

    $database->insert('movies', $data);
}
/**
 * Download a page through the HTTP proxy at http-dyn.abuyun.com:9020,
 * retrying until a non-empty response arrives.
 *
 * Each attempt uses a fresh cURL handle; a failed attempt is followed by a
 * 2-second pause. The proxy credentials can be supplied through the
 * ABUYUN_PROXY_USER and ABUYUN_PROXY_PASS environment variables; when those
 * are unset the values embedded below are used.
 *
 * @param string $url         Address of the page to download.
 * @param int    $maxAttempts Maximum number of attempts; 0 or a negative value
 *                            retries without limit.
 * @return string The non-empty response body.
 * @throws \RuntimeException When every allowed attempt failed.
 */
function get_html_source($url, $maxAttempts = 10)
{
    echo "function get_html_source is running ... \n";

    $proxyServer = "http://http-dyn.abuyun.com:9020";
    // NOTE(review): credentials committed to source are visible to every reader;
    // prefer the environment variables and rotate these fallback values.
    $proxyUser = getenv('ABUYUN_PROXY_USER') ?: "H19D75L76VK89Q8D";
    $proxyPass = getenv('ABUYUN_PROXY_PASS') ?: "8C17B0A80F475BD8";

    // The options are identical for every attempt, so build them once.
    $options = [
        CURLOPT_URL => $url,
        CURLOPT_HTTPPROXYTUNNEL => false,
        // NOTE(review): this turns off TLS certificate verification for the target site.
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_PROXYTYPE => CURLPROXY_HTTP,
        CURLOPT_PROXY => $proxyServer,
        CURLOPT_PROXYAUTH => CURLAUTH_BASIC,
        CURLOPT_PROXYUSERPWD => "{$proxyUser}:{$proxyPass}",
        CURLOPT_USERAGENT => "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727;)",
        CURLOPT_CONNECTTIMEOUT => 3, // seconds allowed for connecting
        CURLOPT_TIMEOUT => 5,        // seconds allowed for the whole transfer
        CURLOPT_RETURNTRANSFER => true, // return the body instead of printing it
    ];

    $lastError = '';
    for ($attempt = 1; $maxAttempts <= 0 || $attempt <= $maxAttempts; $attempt++) {
        $ch = curl_init();
        curl_setopt_array($ch, $options);
        $result = curl_exec($ch);
        $lastError = curl_error($ch);
        curl_close($ch);

        // Same success test as before: any truthy body counts as a result.
        if ($result) {
            return $result;
        }

        // Pause only when another attempt is going to follow.
        if ($maxAttempts <= 0 || $attempt < $maxAttempts) {
            sleep(2);
        }
    }

    throw new \RuntimeException(
        "get_html_source: no response from {$url} after {$maxAttempts} attempts"
        . ($lastError !== '' ? " (last cURL error: {$lastError})" : '')
    );
}
// Scrape the ten board pages (offsets 0, 10, ..., 90) and store each page's movies.
$base_url = "https://maoyan.com/board/4?offset=";
for ($i = 0; $i < 100; $i += 10) {
    $current_url = $base_url . $i;
    // End the line so the URL is not glued to the next progress message.
    echo $current_url . "\n";
    $data = get_data($current_url);
    save_movie($data);
}
複製代碼
提取數據並生成markdown文檔
<?php
// Forward slashes resolve on every platform; a backslash in the path only works on Windows.
require 'vendor/autoload.php';
use Medoo\Medoo;
// Shared Medoo connection to the local MySQL database; the functions in this
// script reach it through `global $database`.
$database = new Medoo([
    'database_type' => 'mysql',
    'database_name' => 'demo_db',
    'server' => 'localhost',
    'username' => 'root',
    'password' => 'root',
    'charset' => 'utf8',
]);
/**
 * Load every stored movie from the `movies` table, in insertion (id) order.
 *
 * The columns are listed explicitly and the rows are ordered by `id`, so the
 * generated report does not depend on the order the engine happens to return.
 *
 * @return mixed Result of Medoo's select(); each row carries the keys
 *               id, title, rank, score, actor, date and img.
 */
function get_data()
{
    global $database;

    $columns = ['id', 'title', 'rank', 'score', 'actor', 'date', 'img'];

    return $database->select('movies', $columns, ['ORDER' => ['id' => 'ASC']]);
}
/**
 * Write the movies to a markdown file, one section per movie.
 *
 * Each section is a blank line, a "## rank-title-score" heading, the date and
 * actor lines as block quotes, and a "---" rule. The target file is truncated
 * first. With the default arguments the output is identical to what this
 * function has always produced.
 *
 * @param array  $data           Rows with the keys rank, title, score, date and
 *                               actor (plus img when $include_poster is true).
 * @param string $path           File to write; defaults to 'maoyanmd.md'.
 * @param bool   $include_poster When true, a markdown image line for the poster
 *                               is written before each heading.
 * @return void
 * @throws \RuntimeException When the target file cannot be opened.
 */
function make_markdown($data, $path = 'maoyanmd.md', $include_poster = false)
{
    $md_obj = fopen($path, 'w+');
    if ($md_obj === false) {
        // Fail loudly instead of passing an invalid handle to fwrite().
        throw new \RuntimeException("make_markdown: cannot open {$path} for writing");
    }

    foreach ($data as $value) {
        $rank = $value['rank'];
        $title = $value['title'];
        $score = $value['score'];
        $date = $value['date'];
        $actor = $value['actor'];

        $section = "\n";
        if ($include_poster) {
            // Swap the size suffix in the stored poster URL for the 320x440 one.
            $img = str_replace("@160w_220h_1e_1c", "@320w_440h_1e_1c", $value['img']);
            $section .= "![{$title}]({$img})\n";
        }
        $section .= "## {$rank}-{$title}-{$score}\n";
        $section .= "> {$date}\n";
        $section .= "> {$actor}\n";
        $section .= "---\n\n\n";

        fwrite($md_obj, $section);
    }

    fclose($md_obj);
}
// Read the stored movies, then render them into the markdown report.
$movies = get_data();
make_markdown($movies);
複製代碼
作業
- 爬取豆瓣電影top250, 把結果保存到數據庫
- 從數據庫中提取電影信息, 做成 Markdown 文件展示