I've been learning web scraping recently, so here is a quick note.
```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import os
import time

web_url = 'http://www.haopic.me'
save_path = 'E:\\haopic\\'


def get_urls(url):
    """Collect links to the individual gallery pages from the list page."""
    web_data = requests.get(url)
    soup = BeautifulSoup(web_data.text, 'lxml')
    anchors = soup.select('ul.item_list div.inner_item_box h2 > a')
    return [a.get('href') for a in anchors]


def get_img_urls(img_url):
    """Download every image on one gallery page into a folder named after its title."""
    web_data = requests.get(img_url)
    soup = BeautifulSoup(web_data.text, 'lxml')
    title = soup.title.text.strip()
    image_urls = soup.select('div.content-c img')
    target_dir = os.path.join(save_path, title)
    if os.path.exists(target_dir):
        print('Directory {} already exists'.format(title))
        return
    os.makedirs(target_dir)
    print(title)
    for i, img in enumerate(image_urls):
        imgurl = img.get('src')
        try:
            # 5-second timeout; RequestException also covers read timeouts,
            # which catching only ConnectionError would miss
            pic = requests.get(imgurl, timeout=5)
        except requests.exceptions.RequestException:
            print('Could not download this image')
            continue
        img_name = 'image{}.jpg'.format(i)  # build the image file name
        print(img_name)
        # save the image to disk
        with open(os.path.join(target_dir, img_name), 'wb') as fp:
            fp.write(pic.content)


if __name__ == '__main__':
    for img_url in get_urls(web_url):
        get_img_urls(img_url)
        time.sleep(5)  # be polite: pause between pages
```
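One thing worth noting: the script above buffers each image fully in memory via `pic.content` before writing it out. For large files, requests also supports streamed downloads with `stream=True` and `iter_content`. A minimal sketch of that variant (the `download_image` helper name is my own, not part of the original script):

```python
import requests

def download_image(url, dest_path, timeout=5):
    """Stream an image to disk in chunks instead of buffering it whole."""
    try:
        resp = requests.get(url, stream=True, timeout=timeout)
        resp.raise_for_status()  # treat HTTP errors (404, 500, ...) as failures too
    except requests.exceptions.RequestException as exc:
        print('Could not download {}: {}'.format(url, exc))
        return False
    with open(dest_path, 'wb') as fp:
        for chunk in resp.iter_content(chunk_size=8192):
            fp.write(chunk)
    return True
```

In the main script this would replace the `requests.get(imgurl, timeout=5)` / `fp.write(pic.content)` pair, and `raise_for_status()` additionally catches pages that respond with an error status instead of an image.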