1 python + requests + BeautifulSoup
主頁面:
http://www.netbian.com/dongman/
圖片僞地址:
http://www.netbian.com/desk/22371.htm
圖片真實地址:
http://img.netbian.com/file/2019/1221/36eb674ba0633d185da078804a3638e6.jpg
1 導入庫
import re

import requests
from bs4 import BeautifulSoup
2 更改請求頭
# User-Agent string sent in the headers of every request in this script.
#
# Alternative User-Agent strings that can be swapped in:
#   "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
#   "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
#   "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
#   "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
#   "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0",
#   "Mozilla/5.0(Windows NT 10.0; Win64; x64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"
ua = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) "
    "Gecko/20100101 Firefox/72.0"
)
3 獲取主頁面的內容
# Download the listing page at `url` and parse it into `soup`.
# A timeout is set because requests otherwise waits indefinitely on a
# stalled connection.
response = requests.get(url, headers={'User-Agent': ua}, timeout=10)
# Stop on an HTTP error status instead of parsing an error page, matching
# the status check already done in the image download step.
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'html.parser')
4 咱們要的是main裏的list中的li標籤中的a標籤的href,而不是a標籤裏的img標籤的src,若是獲取img裏的地址,其大小爲 800*450
# Locate the <div class="list"> container on the listing page. A plain
# string passed as `attrs` is matched against the CSS class.
# The result is named `wallpaper_list` so the builtin `list` is not shadowed.
wallpaper_list = soup.find(name='div', attrs='list')
if wallpaper_list is None:
    # Report a missing container explicitly rather than failing later with
    # an AttributeError on None.
    raise RuntimeError("no <div class='list'> found on the listing page")

for li in wallpaper_list.find_all('li'):
    # Only <a> tags that are direct children of the <li> are wanted.
    for a in li.find_all('a', recursive=False):
        href = a.get('href')
        if not href:
            # An anchor without a target cannot lead to a detail page.
            continue
        # Absolute address of the wallpaper's detail page.
        src = 'http://www.netbian.com' + href
5 截取鏈接裏的數字作爲圖片的名稱(這裏可以自己想怎麼弄就怎麼弄)
# Take the first run of digits in the link's href as the image name.
# The pattern is \d+ rather than \d{5} so that an href containing only
# four digits still matches instead of causing an error.
digits = re.search(r'\d+', a.attrs['href'])
n = digits[0]
6 到達真實圖片地址
res = requests.get(src, headers={'User-Agent': ua}) s = BeautifulSoup(res.text, 'html.parser') p = s.find(name='p') # print(p) img = p.img.attrs['src'] # print(img) # 判斷地址是否爲空 if not img: continue
7 下載
# Download the image at `img` and save it as <n>.jpg under E://paper//.
# A timeout is set so a stalled connection cannot hang the run.
with requests.get(img, headers={'User-Agent': ua}, timeout=10) as resp:
    # Raise on an HTTP error status so an error page is never saved as a .jpg.
    resp.raise_for_status()
    # The payload is binary and written from resp.content, so no text
    # encoding needs to be set on the response.
    # The `with` block closes the file; no explicit close() is needed.
    with open('E://paper//{}.jpg'.format(n), 'wb') as f:
        f.write(resp.content)
8 若要下載全部的圖片
# Page loop: build the address of each listing page in turn.
BASE_URL = 'http://www.netbian.com/dongman/'
# Number of the last listing page to visit.
LAST_PAGE = 138

for i in range(1, LAST_PAGE + 1):
    # The first page is index.htm; later pages are index_<number>.htm.
    page_name = 'index.htm' if i == 1 else 'index_{}.htm'.format(i)
    url = BASE_URL + page_name
    # print(url)
    # NOTE(review): presumably the fetch/parse/download steps that use
    # `url` are meant to run here, once per page - confirm.
9 結果
注: 若會 XPath 的話,用 XPath 會比 BeautifulSoup 要簡單點,我自己是懶得改過去了。