PHP爬蟲 -- 016 作業代碼

作業代碼

<?php

/** * _ooOoo_ * o8888888o * 88" . "88 * (| -_- |) * O\ = /O * ____/`---'\____ * .' \\| |// `. * / \\||| : |||// \ * / _||||| -:- |||||- \ * | | \\\ - /// | | * | \_| ''\---/'' | | * \ .-\__ `-` ___/-. / * ___`. .' /--.--\ `. . __ * ."" '< `.___\_<|>_/___.' >'"". * | | : `- \`.;`\ _ /`;.`/ - ` : | | * \ \ `-. \_ __\ /__ _/ .-` / / * ======`-.____`-.___\_____/___.-`____.-'====== * `=---=' * ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ * 佛祖保佑 永無BUG */

require 'vendor/autoload.php';
use Medoo\Medoo;
use QL\QueryList;

// Medoo connection to the local MySQL "bookstore" database.
// The functions in this file reach it through `global $database`.
$database = new Medoo([
    'database_type' => 'mysql',
    'server' => 'localhost',
    'database_name' => 'bookstore',
    'username' => 'root',
    'password' => 'root',
    'charset' => 'utf8',
]);

// Single QueryList instance; the functions in this file reach it through `global $ql`.
$ql = new QueryList();

/**
 * Download the body of a URL through the HTTP proxy at http-dyn.abuyun.com:9020.
 *
 * Each attempt uses a 3 second connect timeout and a 5 second overall timeout.
 * An attempt that fails (cURL error) or yields an empty body is retried after a
 * 3 second pause. By default the function keeps retrying until it succeeds;
 * pass a positive $max_attempts to give up after that many tries.
 *
 * @param string $url          Target URL to fetch.
 * @param int    $max_attempts Maximum number of attempts; 0 (the default) means unlimited.
 *
 * @return string Non-empty response body of the target URL.
 *
 * @throws RuntimeException Only when $max_attempts is positive and every attempt failed.
 */
function get_html_source($url, $max_attempts = 0) {
    // Proxy endpoint and tunnel credentials.
    // NOTE(review): credentials are hard-coded; consider moving them to configuration.
    $proxyServer = "http://http-dyn.abuyun.com:9020";
    $proxyUser = "H19D75L76VK89Q8D";
    $proxyPass = "8C17B0A80F475BD8";

    // None of these options change between attempts, so build them once
    // instead of on every retry. Order matches the original setopt sequence.
    $options = [
        CURLOPT_URL => $url,
        CURLOPT_HTTPPROXYTUNNEL => false,
        // NOTE(review): TLS peer verification is disabled here.
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_PROXYTYPE => CURLPROXY_HTTP,
        CURLOPT_PROXY => $proxyServer,
        CURLOPT_PROXYAUTH => CURLAUTH_BASIC,
        CURLOPT_PROXYUSERPWD => "{$proxyUser}:{$proxyPass}",
        CURLOPT_USERAGENT => "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727;)",
        CURLOPT_CONNECTTIMEOUT => 3,
        CURLOPT_TIMEOUT => 5,
        CURLOPT_RETURNTRANSFER => true,
    ];

    $attempt = 0;
    while (true) {
        $attempt++;

        $ch = curl_init();
        curl_setopt_array($ch, $options);
        $result = curl_exec($ch);
        $error = curl_error($ch);
        curl_close($ch);

        // Explicit check: a loose `!$result` would also reject a valid body of "0"
        // and retry it forever.
        if ($result !== false && $result !== '') {
            return $result;
        }

        if ($max_attempts > 0 && $attempt >= $max_attempts) {
            throw new RuntimeException(
                sprintf('Failed to fetch %s after %d attempt(s): %s', $url, $attempt, $error)
            );
        }

        // Back off before the next attempt.
        sleep(3);
    }
}

/**
 * Scrape category name(s) and link(s) from the site's home page.
 *
 * @param string $url Home page URL; it is also prepended to every scraped href.
 *
 * @return array Rows with the keys "category_name" and "category_url",
 *               where "category_url" is $url followed by the scraped href.
 */
function get_category($url) {
    echo "function get_category is running.... \n";
    global $ql;

    // Name and link are read from the same anchor element: its text and its href.
    $anchor = '#default > div > div > div > aside > div.side_categories > ul > li > ul > li:nth-child(2) > a';
    $categories = $ql->html(get_html_source($url))->rules([
        "category_name" => [$anchor, 'text'],
        "category_url" => [$anchor, 'href'],
    ])->queryData();

    // Complete each category link by prefixing it with the home page URL.
    foreach ($categories as $index => $category) {
        $categories[$index]['category_url'] = $url . $category['category_url'];
    }

    return $categories;
}

/**
 * Collect the name and price of every book in a category, following pagination.
 *
 * Each page is downloaded exactly once: the same HTML is used for the book
 * list and for the "next" pager link. has_next() is deliberately not called
 * here because it would download the page a second time.
 *
 * @param string $url URL of a category listing page.
 *
 * @return array Rows with the keys "book_name" and "book_price" for this page
 *               and, recursively, all following pages.
 */
function get_book($url) {
    echo "function get_book is running.... \n";
    global $ql;
    echo $url . "\n";

    // Fetch once, then run both queries against the same HTML.
    $html = get_html_source($url);

    $data = $ql->html($html)->rules([
        "book_name" => ['#default > div > div > div > div > section > div:nth-child(2) > ol > li > article > h3 > a', 'title'],
        "book_price" => ['#default > div > div > div > div > section > div:nth-child(2) > ol > li> article > div.product_price > p.price_color', 'text'],
    ])->queryData();

    // href of the "next" pager link; falsy when there is no such link.
    $next = $ql->html($html)->find('#default > div > div > div > div > section > div:nth-child(2) > div > ul > li.next > a')->href;

    if ($next) {
        // Build the next page URL by swapping the last path segment for the href.
        $segments = explode('/', $url);
        $segments[count($segments) - 1] = $next;
        $next_url = implode('/', $segments);

        // Append the books of all following pages to this page's books.
        $data = array_merge($data, get_book($next_url));
    }

    return $data;
}
/**
 * Look up the link to the page that follows the given listing page.
 *
 * @param string $url URL of the current listing page.
 *
 * @return mixed The href of the "li.next > a" element as reported by QueryList;
 *               get_book() treats a falsy value as "no next page".
 */
function has_next($url) {
    echo "function has_next is running.... \n";
    global $ql;

    $html = get_html_source($url);
    $nextLink = $ql->html($html)->find('#default > div > div > div > div > section > div:nth-child(2) > div > ul > li.next > a');

    return $nextLink->href;
}
/**
 * Attach the scraped book list to every category row.
 *
 * @param array $data Category rows; each needs a "category_url" entry.
 *
 * @return array The same rows, each extended with a "books" entry holding
 *               the result of get_book() for that category.
 */
function make_array($data) {
    echo "function make_array is running.... \n";

    foreach ($data as $index => $category) {
        $data[$index]['books'] = get_book($category['category_url']);
    }

    return $data;
}

/**
 * Store the combined category/book data in the database.
 *
 * For every category a category row is created first; the id returned by
 * create_category() is then passed to create_book() for each of its books.
 *
 * @param array $data Category rows with "category_name" and "books" entries;
 *                    each book needs "book_name" and "book_price".
 *
 * @return void
 */
function save_data($data) {
    echo "function save_data is running.... \n";

    foreach ($data as $category) {
        $categoryId = create_category($category['category_name']);

        foreach ($category['books'] as $book) {
            create_book($book['book_name'], $book['book_price'], $categoryId);
        }
    }
}
/**
 * Insert one row into the "category" table.
 *
 * @param string $category_name Value for the "cname" column.
 *
 * @return mixed Whatever $database->id() reports after the insert
 *               (presumably the id of the new row; save_data() uses it as the book's category id).
 */
function create_category($category_name) {
    echo "function create_category is running.... \n";
    global $database;

    $row = ['cname' => $category_name];
    $database->insert('category', $row);

    return $database->id();
}

/**
 * Insert one row into the "book" table.
 *
 * @param string $bname  Value for the "bname" column (book name).
 * @param string $bprice Value for the "bprice" column (book price as scraped).
 * @param mixed  $bcid   Value for the "bcid" column (id of the book's category).
 *
 * @return void
 */
function create_book($bname, $bprice, $bcid) {
    echo "function create_book is running.... \n";
    global $database;

    $row = [
        'bname' => $bname,
        'bprice' => $bprice,
        'bcid' => $bcid,
    ];
    $database->insert('book', $row);
}

// Scrape the categories from the home page, then attach each category's books.
$data = get_category('http://books.toscrape.com/');
$data = make_array($data);

// Write the combined data to the database.
save_data($data);
複製代碼
相關文章
相關標籤/搜索