一:何謂動態加載圖片
php
所謂動態加載圖片,即指html剛加載時圖片是沒有的,而後通過json發送有關圖片的數據,再插入到html裏面去,以達到快速打開網頁的目的。那麼問題來了:咱們如何找到加載圖片數據的json文件呢?而這個問題正是咱們實現爬取百度圖片的第一步,讓小可愛告訴你怎麼做吧。
html
咱們以「表情包」這個關鍵字爲例,如下圖:
小可愛在百度圖片搜索表情包來到上圖頁面,然後按F12來到開發者頁面。
咱們點擊其中兩個json文件,如:
②https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E8%A1%A8%E6%83%85%E5%8C%85&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&word=%E8%A1%A8%E6%83%85%E5%8C%85&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&pn=60&rn=30&gsm=3c&1523454876394=
兩個鏈接的第一處和第二處標紅處爲關鍵字word的URL編碼,可以用urllib.parse.unquote()解碼;另一處pn爲每次刷新的偏移量,計算方式爲rn+上一個pn。
咱們再看看鏈接的內容吧。
這裏你只要記住咱們要找的圖片鏈接爲objURL就行,可以通過re模塊的compile與findall方法找出這個json文件的全部objURL。有了這些objURL,咱們就離成功爬取百度圖片不遠了。ps:因爲百度圖片的objURL加密了,需要先解密才能使用,解密過程這裏就不解釋了。
接着小可愛便附上咱們的完整代碼
"""Download Baidu image-search results for a keyword into a local folder.

Baidu loads its result images dynamically: the page requests JSON from the
``acjson`` endpoint and every entry carries an obfuscated ``objURL``.  This
script pages through that endpoint, decodes each ``objURL`` and stores the
image it points to as ``<n>.jpg``.
"""
import json
import itertools
import urllib.parse
import os
import re
import sys

SAVE_DIR = "F:/maps"
PAGE_COUNT = 100       # number of result pages requested
PAGE_SIZE = 60         # results per page (the ``rn`` query parameter)
REQUEST_TIMEOUT = 10   # seconds, applied to every HTTP request

# This is the JSON endpoint the result page queries, not the page URL itself.
SEARCH_URL = (
    "http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj"
    "&ct=201326592&fp=result&queryWord={word}&cl=2&lm=-1&ie=utf-8&oe=utf-8"
    "&st=-1&ic=0&word={word}&face=0&istype=2&nc=1&pn={pn}&rn={rn}"
)

OBJ_URL_RE = re.compile(r'"objURL":"(.*?)"')

# Multi-character tokens that stand for URL punctuation in an encoded objURL.
STR_TABLE = {
    '_z2C$q': ':',
    '_z&e3B': '.',
    'AzdH3F': '/',
}

# Single-character substitution applied after the tokens above are expanded.
CHAR_TABLE = str.maketrans({
    'w': 'a', 'k': 'b', 'v': 'c', '1': 'd', 'j': 'e', 'u': 'f', '2': 'g',
    'i': 'h', 't': 'i', '3': 'j', 'h': 'k', 's': 'l', '4': 'm', 'g': 'n',
    '5': 'o', 'r': 'p', 'q': 'q', '6': 'r', 'f': 's', 'p': 't', '7': 'u',
    'e': 'v', 'o': 'w', '8': '1', 'd': '2', 'n': '3', '9': '4', 'c': '5',
    'm': '6', '0': '7', 'b': '8', 'l': '9', 'a': '0',
})


def build_search_urls(word, pages=PAGE_COUNT, page_size=PAGE_SIZE):
    """Return the JSON endpoint URLs for the first *pages* result pages.

    *word* is the raw (unquoted) search keyword; it is percent-encoded here.
    """
    quoted = urllib.parse.quote(word)
    return [
        SEARCH_URL.format(word=quoted, pn=page * page_size, rn=page_size)
        for page in range(pages)
    ]


def extract_obj_urls(page_text):
    """Return every (still encoded) ``objURL`` value found in *page_text*."""
    return OBJ_URL_RE.findall(page_text)


def decode_obj_url(encoded):
    """Turn an obfuscated ``objURL`` into the real image URL."""
    # The tokens must be expanded first: they are built from characters that
    # the single-character table would otherwise rewrite.
    for token, plain in STR_TABLE.items():
        encoded = encoded.replace(token, plain)
    return encoded.translate(CHAR_TABLE)


def download_images(word, save_dir=SAVE_DIR, pages=PAGE_COUNT):
    """Download all images found for *word* into *save_dir*.

    Files are named ``1.jpg``, ``2.jpg``, ... in download order.  A page or
    image that cannot be fetched is reported and skipped.  Returns the number
    of images saved.
    """
    # Imported lazily so the pure helpers above stay importable without the
    # third-party dependency installed.
    import requests

    os.makedirs(save_dir, exist_ok=True)
    saved = 0
    for page_url in build_search_urls(word, pages):
        try:
            page_text = requests.get(page_url, timeout=REQUEST_TIMEOUT).text
        except requests.RequestException:
            # One unreachable page must not abort the remaining pages.
            print('失敗2')
            continue

        for encoded in extract_obj_urls(page_text):
            image_url = decode_obj_url(encoded)
            try:
                response = requests.get(image_url, timeout=REQUEST_TIMEOUT)
            except (requests.RequestException, ValueError):
                # Decoded URLs come from scraped data and may be malformed;
                # ValueError covers URL-parsing errors raised before sending.
                print('失敗2')
                continue
            if not response.ok:
                # Any 4xx/5xx body is an error page, not an image.
                print('失敗1')
                continue

            saved += 1
            with open("{}/{}.jpg".format(save_dir, saved), 'wb') as image_file:
                image_file.write(response.content)
    return saved


if __name__ == "__main__":
    download_images(input("請輸入關鍵字:"))
下面兩個分別是小可愛自己原創編寫的爬取匯圖網和蜂鳥網的爬蟲代碼:
匯圖網爬蟲:
"""Download Huitu search-result images for a keyword into a local folder.

The search endpoint returns text containing ``"imgUrl":"..."`` entries; each
referenced image is saved as ``<n>.jpg`` in download order.
"""
import re
import os
import urllib.parse

HEADERS = {
    'content-type': 'application/json',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0',
}
SEARCH_URL = "http://soso.huitu.com/Search/GetAllPicInfo?perPageSize=102&kw={word}&page={num}"
SAVE_DIR = "F:\\im"
REQUEST_TIMEOUT = 10  # seconds, applied to every HTTP request

IMG_URL_RE = re.compile(r'"imgUrl":"(.*?)"')


def build_page_urls(word, pages=range(1, 2)):
    """Return the search URLs for *word* (raw, unquoted) over *pages*.

    The default covers only page 1.
    """
    quoted = urllib.parse.quote(word)
    return [SEARCH_URL.format(word=quoted, num=page) for page in pages]


def extract_image_urls(page_text):
    """Return every ``imgUrl`` value found in *page_text*."""
    return IMG_URL_RE.findall(page_text)


def download_images(word, save_dir=SAVE_DIR):
    """Download all images found for *word* into *save_dir*.

    A page or image that cannot be fetched is reported and skipped.  Returns
    the number of images saved.
    """
    # Imported lazily so the pure helpers above stay importable without the
    # third-party dependency installed.
    import requests

    # The target folder may not exist yet; writing into it would fail.
    os.makedirs(save_dir, exist_ok=True)
    saved = 0
    for page_url in build_page_urls(word):
        print(page_url)
        try:
            # HEADERS carries the browser User-Agent the script prepares.
            page_text = requests.get(
                page_url, headers=HEADERS, timeout=REQUEST_TIMEOUT
            ).text
        except requests.RequestException as exc:
            print("page request failed:", exc)
            continue
        print(page_text)

        for image_url in extract_image_urls(page_text):
            try:
                response = requests.get(
                    image_url, headers=HEADERS, timeout=REQUEST_TIMEOUT
                )
            except (requests.RequestException, ValueError) as exc:
                # Scraped URLs may be malformed; ValueError covers
                # URL-parsing errors raised before the request is sent.
                print("image request failed:", exc)
                continue
            if not response.ok:
                # A 4xx/5xx body is an error page, not an image.
                print("image request failed: HTTP", response.status_code)
                continue

            saved += 1
            with open(save_dir + "\\" + str(saved) + ".jpg", 'wb') as image_file:
                image_file.write(response.content)
    return saved


if __name__ == "__main__":
    download_images(input("請輸入關鍵字:"))
蜂鳥網爬蟲:
"""Download photos listed by the Fengniao photo-list endpoint.

Each list page contains ``"image":"..."`` entries whose URLs have escaped
slashes and a query string; both are stripped before the image is fetched
and saved as ``<n>.jpg`` in download order.
"""
import re
import os
import itertools

LIST_URL = "http://photo.fengniao.com/ajaxPhoto.php?action=getPhotoLists&fid=595&sort=0&page={num}"
SAVE_DIR = "F:\\fengniao"
REQUEST_TIMEOUT = 10  # seconds, applied to every HTTP request

IMAGE_RE = re.compile(r'"image":"(.*?)"')


def build_page_urls(pages=range(1, 100)):
    """Return the list-endpoint URLs for *pages* (default: pages 1-99)."""
    return [LIST_URL.format(num=page) for page in pages]


def extract_image_urls(page_text):
    """Return the cleaned image URLs found in *page_text*.

    Backslashes are removed and everything from the first ``?`` onward is
    dropped from each captured value.
    """
    return [
        raw.replace("\\", "").split("?")[0]
        for raw in IMAGE_RE.findall(page_text)
    ]


def download_images(save_dir=SAVE_DIR):
    """Download every listed photo into *save_dir*.

    A page or image that cannot be fetched is reported and skipped.  Returns
    the number of images saved.
    """
    # Imported lazily so the pure helpers above stay importable without the
    # third-party dependency installed.
    import requests

    os.makedirs(save_dir, exist_ok=True)
    saved = 0
    for page_url in build_page_urls():
        try:
            page_text = requests.get(page_url, timeout=REQUEST_TIMEOUT).text
        except requests.RequestException as exc:
            # One unreachable page must not abort the remaining pages.
            print("page request failed:", exc)
            continue

        for image_url in extract_image_urls(page_text):
            try:
                response = requests.get(image_url, timeout=REQUEST_TIMEOUT)
            except (requests.RequestException, ValueError) as exc:
                # Scraped URLs may be malformed; ValueError covers
                # URL-parsing errors raised before the request is sent.
                print("image request failed:", exc)
                continue
            if not response.ok:
                # A 4xx/5xx body is an error page, not an image.
                print("image request failed: HTTP", response.status_code)
                continue

            saved += 1
            with open(save_dir + "\\" + str(saved) + ".jpg", 'wb') as image_file:
                image_file.write(response.content)
            print(image_url)
    return saved


if __name__ == "__main__":
    download_images()