首先, 建立數據表
-- Storage for the scraped Douban Top 250 entries.
-- The table is named `douban_movie250` because that is the name the PHP
-- scripts in this tutorial pass to insert() and select(); creating it as
-- `douban_movie` left those calls pointing at a table that does not exist.
CREATE TABLE `douban_movie250` (
    `id` int(11) NOT NULL AUTO_INCREMENT,
    `title` varchar(255) NOT NULL,
    -- Poster image URL (taken from the <img> src attribute).
    `img` varchar(255) NOT NULL,
    -- Raw HTML fragment; the export script splits it on '<br>'.
    -- NOTE(review): confirm 255 characters is always enough for this fragment.
    `info` varchar(255) NOT NULL,
    -- Backticks are kept deliberately: `rank` is a reserved word in newer MySQL releases.
    `rank` int(11) NOT NULL,
    -- One integer digit and one decimal digit, e.g. 9.7.
    `score` decimal(2,1) NOT NULL,
    `comment` varchar(255) NOT NULL,
    PRIMARY KEY (`id`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8;
複製代碼
編寫爬蟲代碼
<?php
require 'vendor/autoload.php';
use Medoo\Medoo;
use QL\QueryList;
// Shared Medoo connection; save_data() reaches it through `global $database`.
// NOTE(review): credentials are hard-coded here -- presumably local demo values; confirm before reuse.
$database = new Medoo([
    'database_type' => 'mysql',
    'database_name' => 'demo_db',
    'server'        => 'localhost',
    'username'      => 'root',
    'password'      => 'root',
    'charset'       => 'utf8',
]);
/**
 * Scrape one Top 250 listing page into an array of movie rows.
 *
 * Each row carries the keys extracted by the rule set below (title, img,
 * rank, score, info) plus a `comment` key filled in afterwards.
 *
 * @param string $url Listing page URL; it is fetched through get_html_source().
 * @return array Rows as produced by QueryList's queryData(), with `comment` added.
 */
function get_data($url) {
    echo "function get_data is running .... \n";
    echo $url . "\n";

    $query = new QueryList();
    $page_html = get_html_source($url);

    // CSS selector + attribute pairs, one per extracted column.
    $rules = [
        'title' => ['#content > div > div.article > ol > li > div > div.info > div.hd > a > span:nth-child(1)', 'text'],
        'img' => ['#content > div > div.article > ol > li > div > div.pic > a > img', 'src'],
        'rank' => ['#content > div > div.article > ol > li > div > div.pic > em', 'text'],
        'score' => ['#content > div > div.article > ol > li > div > div.info > div.bd > div > span.rating_num', 'text'],
        'info' => ['#content > div > div.article > ol > li > div > div.info > div.bd > p:nth-child(1)', 'html'],
    ];
    $data = $query->html($page_html)->rules($rules)->queryData();

    foreach ($data as $index => $row) {
        // nth-child is 1-based while the result keys start at 0.
        $position = $index + 1;
        // The quote is looked up per <li> so a movie without one does not
        // shift the quotes of the movies after it.
        // NOTE(review): the HTML is handed to html() again on every iteration;
        // it looks hoistable, but confirm against QueryList before changing it.
        $quote = $query->html($page_html)->find("#content > div > div.article > ol > li:nth-child({$position}) > div > div.info > div.bd > p.quote > span")->text();
        // Fall back to a stock comment when the page has no quote for this movie.
        $row['comment'] = $quote ? $quote : "好看到無話可說!";
        $data[$index] = $row;
    }

    return $data;
}
/**
 * Insert a batch of scraped rows into the `douban_movie250` table.
 *
 * Uses the script-level Medoo connection via `global $database`.
 *
 * @param array $data Rows as returned by get_data(); may be empty when a
 *                    page yielded no matches.
 * @return void
 */
function save_data($data) {
    echo "function save_data is running .... \n";
    global $database;

    // An empty batch gives insert() nothing to write; skip the call and say
    // so, rather than sending the database a meaningless request.
    if (empty($data)) {
        echo "save_data: nothing to insert, skipping \n";
        return;
    }

    $database->insert('douban_movie250', $data);
}
/**
 * Download a page through the HTTP proxy, retrying on failure.
 *
 * A failed or empty response is retried after a 2 second pause. The retry
 * count is bounded so that a permanently unreachable URL or proxy ends the
 * script with an error instead of looping forever.
 *
 * @param string $url          Page to download.
 * @param int    $max_attempts Maximum number of tries (values below 1 are treated as 1).
 * @return string Response body.
 * @throws RuntimeException When every attempt failed.
 */
function get_html_source($url, $max_attempts = 20) {
    echo "function get_html_source is running .... \n";

    // Proxy settings are the same for every attempt, so they live outside the loop.
    // NOTE(review): these credentials are committed in source; consider loading
    // them from configuration or the environment instead.
    $proxyServer = "http://http-dyn.abuyun.com:9020";
    $proxyUser = "H19D75L76VK89Q8D";
    $proxyPass = "8C17B0A80F475BD8";

    $max_attempts = max(1, (int) $max_attempts);
    $last_error = '';

    for ($attempt = 1; $attempt <= $max_attempts; $attempt++) {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HTTPPROXYTUNNEL, false);
        // NOTE(review): TLS peer verification is switched off here; confirm this is intended.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_PROXYTYPE, CURLPROXY_HTTP);
        curl_setopt($ch, CURLOPT_PROXY, $proxyServer);
        curl_setopt($ch, CURLOPT_PROXYAUTH, CURLAUTH_BASIC);
        curl_setopt($ch, CURLOPT_PROXYUSERPWD, "{$proxyUser}:{$proxyPass}");
        curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727;)");
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 3);
        curl_setopt($ch, CURLOPT_TIMEOUT, 5);
        // Return the body from curl_exec() instead of printing it.
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);

        $result = curl_exec($ch);
        // Capture the error text before the handle is closed.
        $last_error = curl_error($ch);
        curl_close($ch);

        if ($result !== false && $result !== '') {
            return $result;
        }

        // Pause before the next try, but not after the final one.
        if ($attempt < $max_attempts) {
            sleep(2);
        }
    }

    throw new RuntimeException(
        "Failed to fetch {$url} after {$max_attempts} attempts: {$last_error}"
    );
}
// The list is paginated 25 movies per page: start=0, 25, ..., 225 (10 pages).
foreach (range(0, 225, 25) as $offset) {
    $page_url = "https://movie.douban.com/top250?start=" . $offset;
    $movies = get_data($page_url);
    save_data($movies);
}
複製代碼
看一下最終數據庫的效果
從數據庫中讀取數據, 寫入 Markdown 文件
<?php
require 'vendor/autoload.php';
use Medoo\Medoo;
// Shared Medoo connection; get_data() reaches it through `global $database`.
// NOTE(review): credentials are hard-coded here -- presumably local demo values; confirm before reuse.
$database = new Medoo([
    'database_type' => 'mysql',
    'database_name' => 'demo_db',
    'server'        => 'localhost',
    'username'      => 'root',
    'password'      => 'root',
    'charset'       => 'utf8',
]);
/**
 * Load every stored movie and attach a two-element `movie_info` array.
 *
 * `movie_info` holds the first two '<br>'-separated parts of the stored
 * `info` HTML, each trimmed. A part that is absent becomes an empty string.
 *
 * @return array Rows from `douban_movie250`, each with `movie_info` added.
 */
function get_data(){
    global $database;
    $data = $database->select('douban_movie250','*');
    foreach ($data as $key => $value) {
        // Pad to two parts so an `info` value without '<br>' does not
        // trigger an undefined-offset error on the second part.
        $parts = array_pad(explode('<br>', $value['info']), 2, '');
        $value['movie_info'] = [trim($parts[0]), trim($parts[1])];
        $data[$key] = $value;
    }
    return $data;
}
/**
 * Write one Markdown section per movie to douban_markdown.md.
 *
 * The file is created in the current working directory and truncated if it
 * already exists. Each section holds the poster image, a
 * "rank-title-score" heading, the two `movie_info` lines in a fenced block,
 * the comment as a quoted heading, and a horizontal rule.
 *
 * @param array $data Rows with img, rank, title, score, comment and
 *                    movie_info (two strings) keys.
 * @return void
 * @throws RuntimeException When the output file cannot be opened.
 */
function make_markdown($data){
    $md_obj = fopen('douban_markdown.md','w+');
    if ($md_obj === false) {
        // Fail here with a clear message instead of calling fwrite() on `false`.
        throw new RuntimeException('Unable to open douban_markdown.md for writing');
    }

    foreach ($data as $value) {
        $movie_info = $value['movie_info'];
        // Assemble the whole section first so each movie is a single write.
        $section = "![]({$value['img']})\n"
            . "## {$value['rank']}-{$value['title']}-{$value['score']}\n"
            . "```\n"
            . "{$movie_info[0]}\n"
            . "{$movie_info[1]}\n"
            . "```\n"
            . "> ### {$value['comment']}\n"
            . "---\n";
        fwrite($md_obj, $section);
    }

    fclose($md_obj);
}
// Script entry point: read the stored rows, then render them to Markdown.
$movies = get_data();
make_markdown($movies);
複製代碼
最終效果