Scrapy 提供了專門下載文件或者圖片的 Pipeline。下載圖片與文件的原理同抓取網頁的原理是一樣的,因此它們的下載過程支持多線程與異步,十分高效。
參考鏈接
首先在 settings 中配置圖片存放路徑:
# Directory (relative path) configured as the storage location for downloaded images.
IMAGES_STORE = './images'
在 item 中定義需要的數據結構:
class Images360Item(scrapy.Item):
    """Container for one scraped image record (id, title, full URL, thumbnail URL)."""

    # Storage target name, exposed under two aliases so that different storage
    # back-ends can read whichever attribute they expect.
    # NOTE(review): presumably a MongoDB collection / SQL table name — confirm
    # against the storage pipelines, which are not shown here.
    table = "images"
    collection = table

    # Declared item fields.
    id = scrapy.Field()
    title = scrapy.Field()
    url = scrapy.Field()
    thumb = scrapy.Field()
定義 spider 與 parse 方法:
import json
from urllib.parse import urlencode

import scrapy
from scrapy import Request

# NOTE(review): the doubled package path is kept exactly as in the original; a
# standard Scrapy project layout would import from `images360.items` — confirm
# against the actual project tree.
from images360.images360.items import Images360Item


class ImagesSpider(scrapy.Spider):
    """Spider that pages through the images.so.com 'photography' hot list.

    Each listing page is fetched from the JSON endpoint and every entry in
    its ``list`` array is converted into an ``Images360Item``.
    """

    name = 'images'
    allowed_domains = ['images.so.com']
    start_urls = ['http://images.so.com/']

    def start_requests(self):
        """Yield one listing request per page.

        The number of pages comes from the ``MAX_PAGE_SIZE`` setting. It is
        read with ``getint`` so that a value supplied as a string (e.g. via
        ``-s MAX_PAGE_SIZE=50``) works, and a missing setting falls back to a
        single page instead of raising ``TypeError``.
        """
        base_url = 'http://images.so.com/zj?'
        max_page = self.settings.getint('MAX_PAGE_SIZE', 1)
        for page in range(1, max_page + 1):
            # Build a fresh query dict per page rather than mutating a shared
            # one; `sn` is the result offset and advances by 30 per page.
            query = {'ch': 'photography', 'listtype': 'hot', 'sn': page * 30}
            url = base_url + urlencode(query)
            self.logger.debug('Requesting listing page: %s', url)
            yield Request(url, self.parse)

    def parse(self, response):
        """Decode the JSON listing and yield one item per image entry.

        Missing keys in an entry default to an empty string; a response
        without a (non-empty) ``list`` key yields nothing.
        """
        payload = json.loads(response.text)
        for entry in payload.get('list') or []:
            images_item = Images360Item()
            images_item['id'] = entry.get('imageid', '')
            images_item['title'] = entry.get('group_title', '')
            images_item['url'] = entry.get('qhimg_url', '')
            images_item['thumb'] = entry.get('qhimg_thumb_url', '')
            yield images_item
定義項目管道:
import logging

from scrapy import Request
from scrapy.exceptions import DropItem
# Import the base class under an alias so the subclass below does not shadow
# the very name it inherits from.
from scrapy.pipelines.images import ImagesPipeline as _BaseImagesPipeline

logger = logging.getLogger(__name__)


class ImagesPipeline(_BaseImagesPipeline):
    """Image pipeline that downloads ``item['url']`` and drops items whose download failed."""

    def get_media_requests(self, item, info):
        """Take the image URL out of the item and yield a download request for it."""
        yield Request(item['url'])

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the file name under which the downloaded image is stored.

        The name is the last ``/``-separated segment of the request URL.
        The keyword-only ``item`` parameter is accepted because newer Scrapy
        releases pass it to ``file_path``; it is unused here.
        """
        url = request.url
        logger.debug('url============ %s', url)
        return url.split('/')[-1]

    def item_completed(self, results, item, info):
        """Return the item if at least one download succeeded, else drop it.

        ``results`` is a list of ``(ok, info)`` tuples for the item, e.g.
        ``[(True, {'url': ..., 'path': ..., 'checksum': ...})]``.

        Raises:
            DropItem: if no download for this item succeeded.
        """
        if not any(ok for ok, _ in results):
            raise DropItem('image download failed')
        return item
注:ImagesPipeline 的優先級應該比存入數據庫的 Pipeline 高(即在 ITEM_PIPELINES 中的數值更小),這樣下載失敗的 item 會先被丟棄,不會被寫入數據庫。