Documentation: http://scrapy-chs.readthedocs.io/zh_CN/latest/topics/images.html
A hands-on example. Goal: scrape the product images from the http://www.hlhua.com/ page.
1. Define the item (items.py):

import scrapy


class MyItem(scrapy.Item):
    image_urls = scrapy.Field()
    image_paths = scrapy.Field()
    images = scrapy.Field()
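For reference, image_urls and images are the field names that ImagesPipeline reads from and writes to by default; only image_paths is a custom field added for this demo. The snippet below is not part of the project, it just shows the Scrapy settings that control those default field names (the values shown are the defaults).

# Defaults used by ImagesPipeline; set these in settings.py only if you
# want the pipeline to use differently named item fields.
IMAGES_URLS_FIELD = 'image_urls'      # field the pipeline reads URLs from
IMAGES_RESULT_FIELD = 'images'        # field the pipeline writes results to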
2. Write the image download pipeline (pipelines.py):

import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


class MyImageDownloadPipeLine(ImagesPipeline):
    def get_media_requests(self, item, info):
        # Issue one download request per URL collected in the item.
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        # Keep only the paths of images that downloaded successfully.
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
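To make the item_completed logic concrete, here is a hedged sketch of what the results argument typically looks like: a list of (success, info) tuples, where info on success is a dict whose 'path' is relative to IMAGES_STORE, and on failure is a Twisted Failure (a plain Exception stands in for it below). The URL, file name, and checksum are made-up placeholders.

# Hypothetical results list, for illustration only.
results = [
    (True, {'url': 'http://www.hlhua.com/images/product1.jpg',
            'path': 'full/0a79c461a4062ac383dc4fade7bc09f1384a3910.jpg',
            'checksum': '2b00042f7481c7b056c4b410d28f33cf'}),
    (False, Exception('download failed')),
]

# Same comprehension as in item_completed above.
image_paths = [x['path'] for ok, x in results if ok]
assert image_paths == ['full/0a79c461a4062ac383dc4fade7bc09f1384a3910.jpg']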
The item_completed override above stores the successfully downloaded file paths in the item's image_paths field once the downloads finish.

3. Edit settings.py to enable MyImageDownloadPipeLine (settings.py):
# coding=utf-8
BOT_NAME = 'imagedemo'

SPIDER_MODULES = ['imagedemo.spiders']
NEWSPIDER_MODULE = 'imagedemo.spiders'

# Enable the custom image pipeline
ITEM_PIPELINES = {'imagedemo.pipelines.MyImageDownloadPipeLine': 1}

# Directory where the downloaded image files are stored
IMAGES_STORE = 'image'

ROBOTSTXT_OBEY = True
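Note that ImagesPipeline requires the Pillow library, and the downloaded files end up under a full/ subdirectory of IMAGES_STORE, named after the SHA-1 hash of their URL. The settings below are optional extras that this demo does not use; they are shown only as a sketch of what the pipeline supports.

# Optional ImagesPipeline settings (not used by this demo).
IMAGES_THUMBS = {'small': (50, 50), 'big': (270, 270)}  # also generate thumbnails
IMAGES_MIN_WIDTH = 110     # drop images narrower than 110 px
IMAGES_MIN_HEIGHT = 110    # drop images shorter than 110 px
IMAGES_EXPIRES = 90        # do not re-download images fetched within 90 days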
4. Write the spider:

# coding=utf-8
from scrapy.spiders import Spider

from imagedemo.items import MyItem


class ImageSpider(Spider):
    name = 'hlhua'
    start_urls = ['http://www.hlhua.com/']

    def parse(self, response):
        # inspect_response(response, self)
        images = []
        # Each product image URL becomes its own item for the pipeline to download.
        for each in response.xpath("//img[@class='goodsimg']/@src").extract():
            m = MyItem()
            m['image_urls'] = [each]
            images.append(m)
        return images
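The project can now be run with the usual scrapy crawl hlhua command. As an alternative, here is a minimal sketch (assuming it is executed from the project root, next to scrapy.cfg) of launching the same spider from a plain Python script.

# run.py - hypothetical helper script, not part of the original repository.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # picks up settings.py above
process.crawl('hlhua')                            # look the spider up by name
process.start()                                   # blocks until the crawl finishes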
github: https://github.com/chenglp1215/scrapy_demo/tree/master/imagedemo