JB的Python之旅-爬蟲篇-MM圖

時間 2019-11-30

標籤 python 之旅爬蟲欄目 Python 简体版

原文原文鏈接

前言：

某羣裏某大學生每天在羣裏說XX網的圖片好看，每天喊着要爬，但又不去做，實在看不下去了，便在深夜寫了此篇；
後續MM圖的爬取，計劃全部放到這裏；有質量高的網站，可以留言，有空爬下~

妹子圖網

不解釋，之前有篇requests的文章，代碼如出一轍，只需要處理下xpath即可

import requests
from lxml import html
import os

# Root output directory (relative path) under which Download_Image creates
# one sub-directory per album. The trailing slash matters: paths are built
# by plain string concatenation (dir + image_title).
# NOTE(review): this name shadows the builtin `dir`; Download_Image reads it,
# so any rename has to change both places together.
dir = "meizitupic/"


def Get_Page_Number(num):
    """Return the album detail-page URLs listed on index page ``num``.

    Args:
        num: Index page number; rendered into the URL with ``str()``.

    Returns:
        A list with one ``href`` value per album link found in the listing
        (empty if the page contains no matching links).
    """
    # The page number is a separate path segment, so a "/" is needed between
    # "page" and the number; plain 'page' + str(num) yields ".../page1".
    url = 'http://www.mzitu.com/page/' + str(num)
    # Hand lxml the raw bytes (.content) rather than decoded text; the
    # original author notes that parsing .text raised an error here.
    response = requests.get(url).content
    # Build an element tree from the response so it can be queried by XPath.
    selector = html.fromstring(response)
    # Each <li> under ul#pins links to one album's detail page.
    return list(selector.xpath("//div[@class='postlist']/ul[@id='pins']/li/a/@href"))


def Get_Image_Title(url):
    """Fetch an album detail page and return the album's title text.

    Args:
        url: URL of the album detail page.

    Returns:
        The first text node of the ``<h2>`` inside ``div.content``.

    Raises:
        IndexError: If the page has no matching ``<h2>`` text.
    """
    page_bytes = requests.get(url).content
    tree = html.fromstring(page_bytes)
    title_nodes = tree.xpath("//div[@class='content']/h2/text()")
    # xpath() always returns a list, so the title is its first entry.
    return title_nodes[0]


def Get_Image_Count(url):
    """Return the album's page count as shown in its pagination bar.

    Args:
        url: URL of the album detail page.

    Returns:
        The text of the second-to-last link in ``div.pagenavi`` (a string,
        not an int).

    Raises:
        IndexError: If the pagination link is missing or has no text.
    """
    response = requests.get(url).content
    selector = html.fromstring(response)
    # Get_Image_Url reads this same link through .../span/text(), i.e. the
    # number sits inside a <span>. Direct text() on the <a> finds nothing in
    # that layout, so take descendant text, which matches either layout.
    image_count = selector.xpath("//div[@class='pagenavi']/a[last()-1]//text()")[0]
    return image_count


def Get_Image_Url(url):
    """Collect the direct image URL from every page of an album.

    Args:
        url: URL of the album detail page.

    Returns:
        A list of image ``src`` strings, one per album page, in page order.

    Raises:
        IndexError: If the pagination link or an image element is missing.
        ValueError: If the pagination text is not an integer.
    """
    first_page = html.fromstring(requests.get(url).content)
    links = []
    # The second-to-last pagination link holds the total page count; each
    # page <url>/1 .. <url>/<total> shows exactly one image.
    total_text = first_page.xpath("//div[@class='pagenavi']/a[last()-1]/span/text()")[0]

    for page_no in range(1, int(total_text) + 1):
        page_url = url + "/" + str(page_no)
        page_tree = html.fromstring(requests.get(page_url).content)
        # The final download address of the single image on this page.
        src = page_tree.xpath("//div[@class='main-image']/p/a/img/@src")[0]
        links.append(str(src))
    return links


def Download_Image(image_title, image_links):
    """Download every image in ``image_links`` into ``dir + image_title``.

    Files are written as ``1.jpg``, ``2.jpg``, ... in link order, and one
    progress line is printed per image.

    Args:
        image_title: Album title; used verbatim as the sub-directory name.
        image_links: Sequence of direct image URLs.
    """
    amount = len(image_links)
    album_dir = dir + image_title
    # NOTE(review): image_title comes from scraped page content and is used
    # unsanitized in the path; consider stripping path separators.

    # exist_ok avoids the racy "check, then create" sequence and lets the
    # directories be prepared once instead of on every iteration.
    os.makedirs(dir, exist_ok=True)
    if image_links:
        os.makedirs(album_dir, exist_ok=True)

    for num, link in enumerate(image_links, start=1):
        print('正在下載圖片：%s第%s/%s張，' % (image_title, num, amount))
        # Per the original author's note, the site answers bare requests with
        # an anti-hotlinking image (every file comes out identical), so a
        # browser User-Agent and a Referer are sent. urlretrieve cannot set
        # request headers, which is why requests is used for the download.
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.32 Safari/537.36',
            'Referer': link}
        # Fetch before opening the file so that a failed request does not
        # leave an empty <num>.jpg behind.
        data = requests.get(link, headers=header).content
        with open(album_dir + "/" + str(num) + ".jpg", 'wb') as f:
            f.write(data)



if __name__ == '__main__':
    # Crawl the index page the user asked for; int() rejects non-numeric
    # input up front instead of requesting a malformed URL.
    page_number = int(input('請輸入須要爬取的頁碼：'))
    for link in Get_Page_Number(page_number):
        # One album per link: resolve its title and image URLs, then save.
        Download_Image(Get_Image_Title(link), Get_Image_Url(link))
複製代碼

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。