PHP爬蟲 -- 018 作業源碼(豆瓣top250)

首先, 建立數據表

-- One row per film scraped from the Douban Top 250 list.
-- The table is named `douban_movie250` because that is the name the PHP scripts
-- in this article pass to insert() and select().
CREATE TABLE `douban_movie250` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `title` varchar(255) NOT NULL,
  `img` varchar(255) NOT NULL,
  `info` varchar(255) NOT NULL,
  `rank` int(11) NOT NULL,
  `score` decimal(2,1) NOT NULL,
  `comment` varchar(255) NOT NULL,
  PRIMARY KEY (`id`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8;
複製代碼

編寫爬蟲代碼

<?php
require 'vendor/autoload.php';
use Medoo\Medoo;
use QL\QueryList;
// Open the database connection; save_data() reaches it through `global $database`.
// The class is referenced as `Medoo`, matching the `use Medoo\Medoo;` import above.
$database = new Medoo([
    'database_type' => 'mysql',
    'database_name' => 'demo_db',
    'server' => 'localhost',
    'username' => 'root',
    'password' => 'root',
    'charset' => 'utf8',
]);

/**
 * Download one listing page and extract the fields of every film on it.
 *
 * @param string $url Address of the listing page to fetch.
 * @return array Rows produced by the QueryList rules (title, img, rank, score, info),
 *               each extended with a `comment` entry.
 */
function get_data($url) {
    echo "function get_data is running .... \n", $url, "\n";

    $query = new QueryList();
    $page_html = get_html_source($url);

    $films = $query->html($page_html)->rules([
        'title' => ['#content > div > div.article > ol > li > div > div.info > div.hd > a > span:nth-child(1)', 'text'],
        'img' => ['#content > div > div.article > ol > li > div > div.pic > a > img', 'src'],
        'rank' => ['#content > div > div.article > ol > li > div > div.pic > em', 'text'],
        'score' => ['#content > div > div.article > ol > li > div > div.info > div.bd > div > span.rating_num', 'text'],
        'info' => ['#content > div > div.article > ol > li > div > div.info > div.bd > p:nth-child(1)', 'html'],
    ])->queryData();

    // The quote is looked up per list item because some films have none;
    // those get a fixed placeholder text instead.
    foreach ($films as $index => $film) {
        $position = $index + 1; // nth-child() is 1-based
        $quote = $query->html($page_html)
            ->find("#content > div > div.article > ol > li:nth-child({$position}) > div > div.info > div.bd > p.quote > span")
            ->text();
        $film['comment'] = $quote ?: "好看到無話可說!";
        $films[$index] = $film;
    }

    return $films;
}
/**
 * Insert the scraped rows into the `douban_movie250` table.
 *
 * @param array $data Rows to insert, one associative array per film.
 * @return void
 */
function save_data($data) {
    echo "function save_data is running .... \n";
    // Nothing was scraped for this page: skip the insert rather than handing
    // the database layer an empty row set.
    if (empty($data)) {
        return;
    }
    global $database;
    $database->insert('douban_movie250', $data);
}
/**
 * Download a page through the authenticated HTTP proxy, retrying until a
 * non-empty response body is received.
 *
 * @param string $url          Address of the page to download.
 * @param int    $max_attempts Maximum number of requests to make; 0 (the default)
 *                             keeps retrying without limit.
 * @return string Response body.
 * @throws RuntimeException When $max_attempts is positive and every attempt failed.
 */
function get_html_source($url, $max_attempts = 0) {
    echo "function get_html_source is running .... \n";

    // NOTE(review): proxy credentials are hard-coded here; they should be moved to
    // configuration or environment variables before this code is shared.
    $proxyServer = "http://http-dyn.abuyun.com:9020";
    $proxyUser = "H19D75L76VK89Q8D";
    $proxyPass = "8C17B0A80F475BD8";

    $options = [
        CURLOPT_URL => $url,
        CURLOPT_HTTPPROXYTUNNEL => false,
        // NOTE(review): certificate verification is disabled, so the peer is not authenticated.
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_PROXYTYPE => CURLPROXY_HTTP,
        CURLOPT_PROXY => $proxyServer,
        CURLOPT_PROXYAUTH => CURLAUTH_BASIC,
        CURLOPT_PROXYUSERPWD => "{$proxyUser}:{$proxyPass}",
        CURLOPT_USERAGENT => "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727;)",
        CURLOPT_CONNECTTIMEOUT => 3,
        CURLOPT_TIMEOUT => 5,
        CURLOPT_RETURNTRANSFER => true,
    ];

    $attempt = 0;
    while (true) {
        $attempt++;

        // A fresh handle is created for every attempt, as in the original loop.
        $ch = curl_init();
        curl_setopt_array($ch, $options);
        $result = curl_exec($ch);
        $error = curl_error($ch);
        curl_close($ch);

        // Both a transport failure (false) and an empty body count as a failed attempt.
        if ($result) {
            return $result;
        }

        if ($max_attempts > 0 && $attempt >= $max_attempts) {
            throw new RuntimeException("Failed to fetch {$url} after {$attempt} attempt(s): {$error}");
        }

        sleep(2); // back off briefly before the next attempt
    }
}

// Visit the 10 listing pages (the `start` offset advances by 25 per page),
// scraping each page and storing its rows in the database.
foreach (range(0, 9) as $i) {
    $url = "https://movie.douban.com/top250?start=" . ($i * 25);
    save_data(get_data($url));
}
複製代碼

看一下最終數據庫的效果

從數據庫中讀取數據, 寫入markdown文件

<?php
require 'vendor/autoload.php';
use Medoo\Medoo;
// Open the database connection; get_data() reaches it through `global $database`.
// The class is referenced as `Medoo`, matching the `use Medoo\Medoo;` import above.
$database = new Medoo([
    'database_type' => 'mysql',
    'database_name' => 'demo_db',
    'server' => 'localhost',
    'username' => 'root',
    'password' => 'root',
    'charset' => 'utf8',
]);

/**
 * Load every stored film and split its `info` HTML into two text lines.
 *
 * @return array Rows selected from `douban_movie250`, each with an added `movie_info`
 *               list: index 0 is the trimmed text before the first line break,
 *               index 1 the trimmed text after it ('' when there is no line break).
 */
function get_data(){
    global $database;
    $data = $database->select('douban_movie250','*');
    foreach ($data as $key => $value) {
        // Split on <br>, <br/> or <br />; fall back to the whole string if the split fails.
        $parts = preg_split('/<br\s*\/?>/i', $value['info']) ?: [$value['info']];
        // Pad so that both indexes exist even when the info has no line break.
        $parts = array_pad($parts, 2, '');
        $value['movie_info'][0] = trim($parts[0]);
        $value['movie_info'][1] = trim($parts[1]);
        $data[$key] = $value;
    }
    return $data;
}

/**
 * Render the film rows as a markdown document and write it to a file.
 *
 * @param array  $data Rows to render; each row is read for the keys img, rank, title,
 *                     score, comment and movie_info (a list with elements 0 and 1).
 * @param string $path Destination file; defaults to 'douban_markdown.md'. The file is
 *                     truncated if it already exists.
 * @return void
 * @throws RuntimeException When the destination file cannot be opened.
 */
function make_markdown($data, $path = 'douban_markdown.md'){
    $md_obj = fopen($path, 'w+');
    if ($md_obj === false) {
        throw new RuntimeException("Unable to open {$path} for writing");
    }

    try {
        foreach ($data as $value) {
            $movie_info = $value['movie_info'];
            // One entry: poster image, heading, the two info lines in a code fence,
            // the quote, then a horizontal rule separating it from the next film.
            $entry = "![]({$value['img']})\n"
                . "## {$value['rank']}-{$value['title']}-{$value['score']}\n"
                . "```\n"
                . "{$movie_info[0]}\n"
                . "{$movie_info[1]}\n"
                . "```\n"
                . "> ### {$value['comment']}\n"
                . "---\n";
            fwrite($md_obj, $entry);
        }
    } finally {
        // Close the handle even if rendering a row fails.
        fclose($md_obj);
    }
}
// Load the stored rows, then render them into the markdown document.
$movies = get_data();
make_markdown($movies);
複製代碼

最終效果

相關文章
相關標籤/搜索