#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Download the "natural scenery" picture albums from ivsky.com.

The crawl works in three steps:

1. Download the page source.
2. Parse the page source (the hardest part).
3. Store the extracted data.

It walks three levels of pages: the category menu on the start page, the
album links on each category page, and the paginated image listing of each
album.  Images are written to ``images/<category title>/<album title>/``.
"""
# Work with folders / paths
import os
import re
from urllib.parse import urljoin

# Parse the data of interest out of a web page
from lxml import etree
# Fetch page source / download pages, images, video, audio, ...
import requests

START_URL = "http://www.ivsky.com/tupian/ziranfengguang/"
IMAGE_ROOT = "images"
# Seconds to wait for the server before a request is abandoned.
REQUEST_TIMEOUT = 30

CATEGORY_XPATH = "//ul[@class='tpmenu']/li/a"
ALBUM_XPATH = "//div[@class='sline']/div/a"
IMAGE_XPATH = "//div[@class='il_img']/a/img"

# Characters that cannot appear in a Windows file or folder name; titles and
# alt texts come straight from the page, so they may contain any of them.
_UNSAFE_CHARS = re.compile(r'[\\/:*?"<>|]')


def _safe_name(text):
    """Return *text* made usable as a single path component.

    Reserved characters are replaced by ``_`` and surrounding whitespace is
    removed; an empty result becomes ``_`` so the component never vanishes.
    """
    cleaned = _UNSAFE_CHARS.sub("_", text).strip()
    return cleaned or "_"


def _fetch(session, url):
    """GET *url* and return the response, or ``None`` if the request failed.

    Connection errors, timeouts and non-2xx status codes are all reported on
    stdout and turned into ``None`` so one bad URL does not abort the crawl.
    """
    try:
        response = session.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as err:
        print("request failed: %s (%s)" % (url, err))
        return None
    return response


def _fetch_root(session, url):
    """Download *url* and return its parsed root node, or ``None``."""
    response = _fetch(session, url)
    if response is None or not response.content:
        return None
    # `content` (bytes) and `text` (str) hold the same data in different
    # types; the bytes are parsed so lxml can detect the encoding itself.
    return etree.HTML(response.content)


def _links(anchors, page_url):
    """Yield ``(title, absolute_url)`` for every usable ``<a>`` in *anchors*.

    Anchors lacking a text node or an ``href`` are skipped.  Note that an
    xpath query always returns a list, hence the emptiness checks before
    taking element ``[0]``.
    """
    for anchor in anchors:
        # text() selects the text between the tags, @name selects an attribute.
        titles = anchor.xpath("text()")
        hrefs = anchor.xpath("@href")
        if not titles or not hrefs:
            continue
        # urljoin copes with absolute, root-relative, protocol-relative and
        # document-relative links alike.
        yield titles[0], urljoin(page_url, hrefs[0])


def _download_album(session, album_url, folder):
    """Save every image of the album at *album_url* into *folder*.

    Pages are requested as ``<album>/index_<n>.html`` starting from n = 2
    (page 1 is the album URL itself) until a page cannot be fetched, lists
    no images, or lists only images that were already seen.
    """
    seen_sources = set()
    page = 1
    page_url = album_url
    while True:
        root = _fetch_root(session, page_url)
        if root is None:
            break
        images = root.xpath(IMAGE_XPATH)
        if not images:
            break

        entries = []
        for idx, img in enumerate(images):
            sources = img.xpath("@src")
            if not sources:
                continue
            alts = img.xpath("@alt")
            alt = alts[0] if alts else ""
            entries.append((idx, urljoin(page_url, sources[0]), alt))

        # Guard against a server that answers out-of-range page numbers with
        # an earlier page instead of an error: without it the loop would
        # never terminate.
        if seen_sources.issuperset(src for _, src, _ in entries):
            break

        for idx, src, alt in entries:
            seen_sources.add(src)
            img_response = _fetch(session, src)
            if img_response is None:
                continue
            # The alt text alone is not unique, so page and index are added.
            name = _safe_name(alt + str(page) + "_" + str(idx) + ".jpg")
            with open(os.path.join(folder, name), "wb") as fh:
                fh.write(img_response.content)

        page += 1
        # Strip the trailing slash first so the result is
        # ".../album/index_2.html" rather than ".../album//index_2.html".
        page_url = album_url.rstrip("/") + "/index_%s.html" % page


def main():
    """Crawl every category and album reachable from ``START_URL``."""
    with requests.Session() as session:
        root = _fetch_root(session, START_URL)
        if root is None:
            return
        # Slice off the first menu entry ("all categories").
        categories = root.xpath(CATEGORY_XPATH)[1:]
        for big_title, big_url in _links(categories, START_URL):
            big_root = _fetch_root(session, big_url)
            if big_root is None:
                continue
            albums = big_root.xpath(ALBUM_XPATH)
            for small_title, small_url in _links(albums, big_url):
                print(small_title, small_url)
                folder = os.path.join(
                    IMAGE_ROOT, _safe_name(big_title), _safe_name(small_title)
                )
                # exist_ok avoids the "folder already exists" error without
                # a separate (race-prone) existence check.
                os.makedirs(folder, exist_ok=True)
                _download_album(session, small_url, folder)


if __name__ == "__main__":
    main()