本文正在參加「Python主題月」,詳情查看活動鏈接。
何爲爬蟲:網絡爬蟲(又稱爲網頁蜘蛛,網絡機器人,在FOAF社區中間,更常常的稱爲網頁追逐者),是一種按照必定的規則,自動地抓取萬維網信息的程序或者腳本。另一些不常使用的名字還有螞蟻、自動索引、模擬程序或者蠕蟲。百度百科詳情python
隨着大數據時代的到來,人們對數據資源的需求愈來愈多,而爬蟲是一種很好的自動採集數據的手段。推薦一個Python網絡爬蟲學習路線解讀git
分享幾個Python學習鏈接:github
1.請叫我汪海 的CSDNwindows
2.廖雪峯大佬的教程和廖雪峯大佬的視頻版教程數組
3.爬蟲小白入門markdown
4.爬蟲框架 Scrapy網絡
官網下載Pythonapp
前人種樹,後人乘涼,感謝!
框架
使用pip install ***
安裝對應依賴
pip install urllib
複製代碼
我這使用的時候提示須要升級pip python -m pip install --upgrade pip
python main.py
複製代碼
運行,發現成功下載圖片。
動手試試有驚嚇或者驚喜(Practice makes perfect!)。
# -*- coding:utf-8 -*-
import os
import random
import ssl
import time
import urllib.request
from bs4 import BeautifulSoup
# User-Agent sent with every request (impersonates a desktop Chrome browser
# so the site does not reject the crawler outright).
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"
# Base URL of the site being scraped.
BASE_URL = "https://www.mzitu.com"
# Directory where downloaded images are stored (one sub-directory per gallery).
BASE_DIR = "../images"
def start_work(serial_id):
    """Download every image of gallery *serial_id* into BASE_DIR/serial_id.

    Creates the target directory if needed, queries the gallery's page
    count, then fetches each page's image.
    """
    picture_dir = os.path.join(BASE_DIR, serial_id)
    # os.makedirs also creates the missing parent (BASE_DIR), unlike the
    # original os.mkdir which raised FileNotFoundError when BASE_DIR was
    # absent; exist_ok avoids a race with a pre-existing directory.
    os.makedirs(picture_dir, exist_ok=True)
    page_count = get_page_count(serial_id)
    print("%s 共%d個圖片" % (serial_id, page_count))
    get_image_for_serial(picture_dir, serial_id, page_count)
# Fetch the gallery's landing page and read its page count.
def get_page_count(serial_id):
    """Return the number of picture pages in gallery *serial_id*."""
    gallery_url = "%s/%s" % (BASE_URL, serial_id)
    request = urllib.request.Request(
        gallery_url, headers={"user-agent": USER_AGENT})
    # Unverified context: the site's TLS chain is not validated here.
    response = urllib.request.urlopen(
        request, context=ssl._create_unverified_context())
    page_html = response.read().decode("utf-8")
    return __get_counts(page_html)
# Parse the total page count out of the pagination widget.
def __get_counts(html_content):
    """Return the page count found in *html_content*, or 0 when absent."""
    soup = BeautifulSoup(html_content, 'lxml')
    spans = soup.select("body > div.main > div.content > div.pagenavi > a > span")
    # Fewer than three spans means the pagination block is missing/unexpected.
    if not spans or len(spans) < 3:
        return 0
    # The last span is the "next" link; the one before it holds the count.
    return int(spans[-2].get_text())
# Extract the main image's URL from a picture page.
def get_image_url(html_content):
    """Return the src of the main image in *html_content*, or None on failure."""
    soup = BeautifulSoup(html_content, 'lxml')
    matches = soup.select("body > div.main > div.content > div.main-image > p > a > img")
    try:
        return matches[0].get("src")
    except Exception as ex:
        # Selector found nothing (or page layout changed) — log and give up.
        print("exception occur:%s" % ex)
        return None
# Collect the image URL of every page in a gallery.
def get_all_image_urls(serial_id, page_count):
    """Return a list of image URLs, one per page of gallery *serial_id*.

    Bug fix: the original returned None (bare ``return``) when
    page_count <= 1, even though its contract is to return a list;
    callers iterating the result would crash. An empty list is
    returned instead.
    """
    url_list = list()
    if page_count <= 1:
        return url_list
    header = {"user-agent": USER_AGENT}
    context = ssl._create_unverified_context()
    for x in range(1, page_count + 1):
        print("獲取第%d張圖片的地址" % x)
        url = "%s/%s/%s" % (BASE_URL, serial_id, x)
        req = urllib.request.Request(url, headers=header)
        resp = urllib.request.urlopen(req, context=context)
        str_content = resp.read().decode("utf-8")
        img_url = get_image_url(str_content)
        if img_url:
            url_list.append(img_url)
            print("第%d張圖片地址是:%s" % (x, img_url))
        # Random pause between requests to avoid hammering the server.
        time.sleep(random.randint(1, 2))
    return url_list
# Download every image of a gallery, one page at a time.
def get_image_for_serial(dir_path, serial_id, total_count):
    """Fetch images 1..total_count of gallery *serial_id* into *dir_path*."""
    for page_index in range(1, total_count + 1):
        print("開始獲取第%d張圖片" % page_index)
        get_image_for_index(dir_path, serial_id, page_index)
        # Sleep 0.1–1.0 s between pages to throttle the crawl.
        time.sleep(random.randint(1, 10) / 10)
# Fetch one picture page, resolve its image URL, then save that image.
def get_image_for_index(dir_path, serial_id, page_index):
    """Download the image shown on page *page_index* of gallery *serial_id*."""
    print("獲取第%d張圖片的地址" % page_index)
    ref_url = "%s/%s/%s" % (BASE_URL, serial_id, page_index)
    request = urllib.request.Request(
        ref_url, headers={"user-agent": USER_AGENT})
    response = urllib.request.urlopen(
        request, context=ssl._create_unverified_context())
    page_html = response.read().decode("utf-8")
    img_url = get_image_url(page_html)
    # Guard clause: nothing to save when no image URL was found.
    if not img_url:
        return
    print("第%d張圖片地址是:%s" % (page_index, img_url))
    print("嘗試保存圖片%s" % img_url)
    save_img(dir_path, img_url, ref_url)
# Save a batch of already-resolved image URLs.
def save_imgs(dir_path, img_urls):
    """Download every URL in *img_urls* into directory *dir_path*.

    Bug fix: save_img() takes three required arguments
    (dir_path, img_url, ref_url); the original two-argument call raised
    TypeError on every invocation. The image URL itself is passed as the
    Referer, since no page URL is available here.
    """
    for img_addr in img_urls:
        save_img(dir_path, img_addr, img_addr)
# Download one image, sending a Referer header to pass hotlink protection.
def save_img(dir_path, img_url, ref_url):
    """Download *img_url* into *dir_path*, sending *ref_url* as Referer.

    The Referer header is required: the image host rejects requests that
    do not appear to come from the gallery page.
    """
    header = {
        "user-agent": USER_AGENT,
        "Referer": ref_url
    }
    context = ssl._create_unverified_context()
    req = urllib.request.Request(img_url, headers=header)
    resp = urllib.request.urlopen(req, context=context)
    content = resp.read()
    # File name is the last path segment of the image URL.
    file_name = img_url.split('/')[-1]
    # `with` closes the file on exit; the original's explicit f.close()
    # inside the with-block was redundant and has been removed.
    with open(os.path.join(dir_path, file_name), 'wb') as f:
        f.write(content)
    print("已向目錄:%s 保存文件:%s" % (dir_path, file_name))
    # Random pause so consecutive downloads don't hammer the server.
    time.sleep(random.randint(1, 2))
if __name__ == "__main__":
    # Gallery ids to download; append more ids to fetch more galleries.
    for gallery_id in ["204061"]:
        start_work(gallery_id)
複製代碼
我學會了Python嗎?並沒有!!我只是成功安裝了Python,然後成功運行了一個用例,算是爬蟲的一次實踐。我現在會的只是一些基礎類型和函數。