# 爬取妹紙網，重點是加入正則表達式 (scrape the mzitu photo site; the key point is the regular expressions)
# Example of the header added below: 'Referer': 'http://www.mzitu.com/'
# -*- coding: utf-8 -*-
"""Scrape image URLs from mzitu.com gallery pages and download them.

Each gallery page is fetched with a Referer header because the site uses
hotlink protection and rejects requests that do not appear to originate
from its own pages.
"""
import re
import time

# Regex that pulls the image URL out of a gallery page.
# NOTE(review): brittle — tied to the site's current HTML markup; re-verify
# against a live page if scraping stops working.
PHOTO_PATTERN = r'<p>.*?<.*?src=.*?"(.*?)".*?alt=.*?width=.*?>'

BASE_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.33 Safari/537.36',
    # Tells the site which page the request came from (hotlink protection).
    'Referer': 'http://www.mzitu.com/',
}


def extract_photo_urls(html):
    """Return every image URL found in *html* via PHOTO_PATTERN.

    Args:
        html: raw HTML text of one gallery page.
    Returns:
        list[str] of captured image URLs (empty if none match).
    """
    # re.S lets '.' span newlines since the tag may be split across lines.
    return re.findall(PHOTO_PATTERN, html, re.S)


def download_gallery(gallery_id='149482', first_page=1, last_page=9, delay=2):
    """Download all images of gallery *gallery_id*, pages first..last.

    Args:
        gallery_id: numeric id in the gallery URL (default matches the
            original hard-coded value).
        first_page/last_page: inclusive page range (original: 1..9).
        delay: seconds to sleep between requests, to be polite to the site.
    Side effects:
        Writes '<page>_<index>.jpg' files into the current directory.
    """
    # Deferred third-party import: keeps the module importable (e.g. for
    # testing extract_photo_urls) on machines without requests installed.
    import requests

    # BUG FIX: the original created a session but never used it; a Session
    # reuses the TCP connection across the many requests made here.
    session = requests.Session()
    for page in range(first_page, last_page + 1):
        page_url = 'https://www.mzitu.com/{}/{}'.format(gallery_id, page)
        html = session.get(page_url, headers=BASE_HEADERS).text
        photo_urls = extract_photo_urls(html)
        print(photo_urls)
        time.sleep(delay)
        # BUG FIX: the original built a per-page Referer dict but never
        # passed it; image servers check that the Referer is the gallery page.
        image_headers = dict(BASE_HEADERS, Referer=page_url)
        for index, photo_url in enumerate(photo_urls):
            response = session.get(photo_url, headers=image_headers)
            print(response)
            print("正在下載第%s張 " % (page))
            # BUG FIX: include the image index in the filename so multiple
            # images on one page no longer overwrite each other.
            with open('{}_{}.jpg'.format(page, index), 'wb') as f:
                f.write(response.content)
            time.sleep(delay)


if __name__ == '__main__':
    download_gallery()
# 加入 Referer 的目的：請求時告訴網站是從哪個頁面進來的 (the Referer header tells the site which page the request came from, needed to pass its hotlink check)