1.今天給你們介紹自己寫的一個圖片爬蟲,說白了就是從網頁上自動下載需要的圖片
2.首先選取目標為:http://www.zhangzishi.cc/(漲姿勢)這個網站,如下圖,我們的目標就是爬取該網站福利社的全部美圖
3.福利社地址為http://www.zhangzishi.cc/category/welfare,獲取圖片就是獲取網站上所有圖片的url地址
A.打開URL,獲取html代碼
def url_open(url):
    """Fetch *url* and return the raw response body as ``bytes``.

    A desktop-browser User-Agent header is sent instead of urllib's
    default one (presumably the site blocks the default UA — TODO confirm).

    :param url: absolute URL to request.
    :return: response body as bytes (caller decodes if it is text).
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36')
    # Context manager guarantees the HTTP connection is closed even if
    # read() raises — the original never closed the response object.
    with urllib.request.urlopen(req) as response:
        html = response.read()
    print('url_open')
    return html
B.從html代碼中摘取網頁連結,返回的是一個列表
def page_htmls(url, count):
    """Scan the listing page at *url* and return up to *count* article URLs.

    Looks for anchors of the form ``a target="_blank" href=...`` and slices
    out the href value up to and including its ``.html`` suffix.  Each URL
    found is echoed to stdout before the list is returned.
    """
    text = url_open(url).decode('utf-8')
    anchor = 'a target="_blank" href='
    links = []
    matched = 0
    start = text.find(anchor)
    while start != -1:
        matched += 1
        end = text.find('.html', start, start + 200)
        if end == -1:
            # no .html within range: skip past this anchor's attribute text
            resume = start + 24
        else:
            links.append(text[start + 24:end + 5])
            resume = end
        start = text.find(anchor, resume)
        if matched == count:
            break
    for link in links:
        print(link)
    return links
C.從每一個連結頁中獲取圖片地址,我這裡用了兩種方法
def find_imgs(url):
    """Return a list of .jpg URLs scraped from ``img src=`` tags on *url*.

    Protocol-relative sources (no ``http`` in them) are prefixed with
    ``http:`` so every returned entry is an absolute URL.
    """
    page = url_open(url).decode('utf-8')
    marker = 'img src='
    found = []
    pos = page.find(marker)
    while pos != -1:
        tail = page.find('.jpg', pos, pos + 100)
        if tail == -1:
            # no .jpg nearby: move past the marker text and keep scanning
            resume = pos + 9
        else:
            candidate = page[pos + 9:tail + 4]
            if 'http' in candidate:
                found.append(candidate)
            else:
                found.append('http:' + candidate)
            resume = tail
        pos = page.find(marker, resume)
    return found


def imgurl_get(url):
    """Return .jpg URLs from *url* whose ``src`` follows the inline style
    ``color: #555555;`` — the pattern this site uses for gallery images.

    Every hit is assumed protocol-relative and gets an ``http:`` prefix.
    """
    page = url_open(url).decode('utf-8')
    marker = 'color: #555555;" src='
    found = []
    pos = page.find(marker)
    while pos != -1:
        tail = page.find('.jpg', pos, pos + 100)
        if tail == -1:
            resume = pos + 22
        else:
            found.append('http:' + page[pos + 22:tail + 4])
            resume = tail
        pos = page.find(marker, resume)
    return found
D.根據圖片url下載圖片到文件
def save_imgs(folder, imgs):
    """Download every image URL in *imgs* into the current working directory.

    The file name is the last path component of each URL.  *folder* is kept
    for interface compatibility; the caller has already chdir'd into it.
    """
    for img_url in imgs:
        filename = img_url.split('/')[-1]
        with open(filename, 'wb') as f:
            f.write(url_open(img_url))


def download_mm(folder='H:\\xxoo2', page_count=100, count=100):
    """Crawl the welfare category and download gallery images.

    :param folder: destination directory (created if missing).
    :param page_count: max article links to follow per listing page.
    :param count: number of listing pages (``/page/1`` .. ``/page/count``).
    """
    main_url = 'http://www.zhangzishi.cc/category/welfare'
    # Build the paginated listing URLs up front.
    main_urls = [main_url + '/page/' + str(i + 1) for i in range(count)]
    # makedirs(exist_ok=True): the original os.mkdir crashed on re-runs
    # when the folder already existed.
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)
    for url in main_urls:
        for page in page_htmls(url, page_count):
            save_imgs(folder, imgurl_get(page))
E.開始下載
def download__img(folder='H:\\xxoo', page_count=100):
    """Alternative entry point: crawl only the first listing page and save
    images found via the generic ``img src=`` scan (find_imgs)."""
    main_url = 'http://www.zhangzishi.cc/category/welfare'
    os.mkdir(folder)
    os.chdir(folder)
    for page in page_htmls(main_url, page_count):
        save_imgs(folder, find_imgs(page))


if __name__ == '__main__':
    download_mm()
    #download__img()
F.下載結果圖片
順便附上所有代碼:
import os
import urllib.request


def url_open(url):
    """Fetch *url* and return the raw response body as ``bytes``.

    Sends a desktop-browser User-Agent (presumably the site blocks
    urllib's default UA — TODO confirm).
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36')
    # Close the HTTP response deterministically (the original leaked it).
    with urllib.request.urlopen(req) as response:
        html = response.read()
    print('url_open')
    return html


def page_htmls(url, count):
    """Return up to *count* article URLs linked from the listing page *url*.

    Scans for ``a target="_blank" href=`` anchors and slices out each href
    up to and including ``.html``.  Prints every URL found.
    """
    html = url_open(url).decode('utf-8')
    pages = []
    i = 0
    a = html.find('a target="_blank" href=')
    while a != -1:
        i += 1
        b = html.find('.html', a, a + 200)
        if b != -1:
            pages.append(html[a + 24:b + 5])
        else:
            # no .html nearby — resume scanning past this anchor
            b = a + 24
        a = html.find('a target="_blank" href=', b)
        if i == count:
            break
    for each in pages:
        print(each)
    return pages


def find_imgs(url):
    """Return absolute .jpg URLs scraped from ``img src=`` tags on *url*."""
    html = url_open(url).decode('utf-8')
    imgs = []
    a = html.find('img src=')
    while a != -1:
        b = html.find('.jpg', a, a + 100)
        if b != -1:
            src = html[a + 9:b + 4]
            # Prefix protocol-relative sources so the result is absolute.
            if src.find('http') == -1:
                imgs.append('http:' + src)
            else:
                imgs.append(src)
        else:
            b = a + 9
        a = html.find('img src=', b)
    return imgs


def imgurl_get(url):
    """Return .jpg URLs whose ``src`` follows the inline style
    ``color: #555555;`` (the pattern used for this site's gallery images)."""
    html = url_open(url).decode('utf-8')
    imgurls = []
    a = html.find('color: #555555;" src=')
    while a != -1:
        b = html.find('.jpg', a, a + 100)
        if b != -1:
            imgurls.append('http:' + html[a + 22:b + 4])
        else:
            b = a + 22
        a = html.find('color: #555555;" src=', b)
    return imgurls


def save_imgs(folder, imgs):
    """Download each URL in *imgs* into the current working directory,
    named by the URL's last path component."""
    for ea in imgs:
        filename = ea.split('/')[-1]
        with open(filename, 'wb') as f:
            f.write(url_open(ea))


def download_mm(folder='H:\\xxoo2', page_count=100, count=100):
    """Crawl *count* listing pages of the welfare category, following up to
    *page_count* article links per page, saving images into *folder*."""
    main_url = 'http://www.zhangzishi.cc/category/welfare'
    main_urls = [main_url + '/page/' + str(i + 1) for i in range(count)]
    # exist_ok=True: the original os.mkdir crashed when re-run.
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)
    for url in main_urls:
        for page in page_htmls(url, page_count):
            save_imgs(folder, imgurl_get(page))


def download__img(folder='H:\\xxoo', page_count=100):
    """Alternative entry point: first listing page only, generic img scan."""
    main_url = 'http://www.zhangzishi.cc/category/welfare'
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)
    for page in page_htmls(main_url, page_count):
        save_imgs(folder, find_imgs(page))


if __name__ == '__main__':
    download_mm()
    #download__img()