Python爬蟲 - wallhaven任意頁面下的壁紙批量下載

時間 2021-08-12

標籤 git github 正則表達式 dom url code 圖片 get requests 欄目 Python 简体版

原文鏈接

基於Python 任意頁面下的壁紙批量下載
github
Maxpagenum：要爬取的頁數
fpath：圖片的保存路徑
url：圖片列表頁的基礎地址

import requests
import re
import time
import os
# Number of listing pages to crawl; the main loop requests pages 1..Maxpagenum.
Maxpagenum = 10
# Delay in seconds passed to time.sleep() before each wallpaper detail-page request.
Sleeptime =0.1
def creatPath(path):
    """Create the directory *path*, including missing parents, if it is absent.

    Prints "Creat path" when the directory has to be created and does nothing
    when *path* already exists.
    """
    if os.path.exists(path):
        return
    print("Creat path")
    os.makedirs(path)

def stripPageParam(url):
    """Return *url* with any ``page`` query parameter removed.

    The listing URL is normally pasted from the browser and may already carry
    ``page=N``. Sending ``page`` again through ``params`` would put the key in
    the query string twice and leave it to the server which value wins, so the
    embedded one is dropped before paging starts.
    """
    from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

    parts = urlsplit(url)
    kept = [
        (key, value)
        for key, value in parse_qsl(parts.query, keep_blank_values=True)
        if key != 'page'
    ]
    return urlunsplit(parts._replace(query=urlencode(kept)))


if __name__ == '__main__':

    # Directory the wallpapers are saved into. Raw string: "\D" and "\p" are
    # not valid escape sequences and make a plain literal emit a warning.
    fpath = r"D:\Download\pic"

    creatPath(path=fpath)
    # Example source URLs:
    #   'https://wallhaven.cc/search?q=id%3A2278&sorting=random&ref=fp&seed=ZYNEUQ&page=2'
    #   'https://wallhaven.cc/hot'
    #   'https://wallhaven.cc/hot?page=4'

    # Listing page URL; a "page" parameter inside it is ignored because the
    # loop below supplies the page number itself.
    url = 'https://wallhaven.cc/search?q=id%3A4641&page=4'
    base_url = stripPageParam(url)

    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"
    # Seconds to wait on any single request before giving up on it.
    request_timeout = 30

    # Link from a listing page to a wallpaper's detail page.
    detail_link_re = re.compile('<a class="preview" href="(.*?)"  target="_blank"  ></a>', re.S)
    # Full-size image URL on a detail page.
    image_src_re = re.compile('<img id="wallpaper" src="(.*?)" alt', re.S)

    picnum = 0
    # Walk listing pages 1..Maxpagenum.
    for pagenum in range(1, Maxpagenum + 1):
        listing_headers = {'referer': base_url, "User-Agent": user_agent}
        try:
            listing = requests.get(
                url=base_url,
                headers=listing_headers,
                params={'page': str(pagenum)},
                timeout=request_timeout,
            )
        except requests.RequestException as err:
            print("skip page " + str(pagenum) + ": " + str(err))
            continue

        # Detail pages are requested with the listing page as referer.
        detail_headers = {'referer': listing.url, "User-Agent": user_agent}

        for src in detail_link_re.findall(listing.text):
            time.sleep(Sleeptime)

            try:
                detail_page = requests.get(
                    url=src, headers=detail_headers, timeout=request_timeout
                ).text
                found = image_src_re.search(detail_page)
                if found is None:
                    # Layout change or blocked page: skip instead of crashing.
                    print("skip " + src + ": no wallpaper image found")
                    continue
                img_url = found.group(1)
                image = requests.get(url=img_url, timeout=request_timeout)
            except requests.RequestException as err:
                print("skip " + src + ": " + str(err))
                continue

            if not image.ok:
                # Do not save an error body as if it were an image.
                print("skip " + img_url + ": HTTP " + str(image.status_code))
                continue

            img_name = img_url.split('/')[-1]
            img_path = os.path.join(fpath, img_name)
            # "with" closes the file even if the write fails.
            with open(img_path, 'wb') as fp:
                fp.write(image.content)
            print("finish " + str(picnum))
            picnum += 1