語法:item_completed(results, items, info)

當一個單獨項目中的全部圖片請求完成時(無論下載成功或者失敗),ImagesPipeline.item_completed() 方法將被調用。item_completed() 方法必須返回將發送到後續 item pipeline 階段的輸出,所以必須返回或刪除 item(默認情況下 item_completed 會返回全部 item)。
class ImagePipeline(ImagesPipeline):
    """Image pipeline that stores files under a year-named directory and
    records the saved path back onto the item."""

    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
                          'please use file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from image_key or file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() or image_key() methods have been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        elif not hasattr(self.image_key, '_base'):
            _warn()
            return self.image_key(url)
        ## end of deprecation warning block

        image_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
        # use the current year as the storage directory name
        return '{}/{}.jpg'.format(datetime.now().year, image_guid)

    def item_completed(self, results, item, info):
        # collect the stored paths of every successfully downloaded image
        saved_paths = [value['path'] for ok, value in results if ok]
        # keep the first path on the item, falling back to a default image
        item['image_path'] = saved_paths.pop(0) if saved_paths else 'default.jpg'
        return item
hexdigest() 用於提取摘要。還可以使用 isinstance() 來判斷傳入值的編碼類型,使用 encode()
# encode() converts a unicode str into bytes so it can be hashed.
from hashlib import md5


def get_md5(url):
    """Return the hex MD5 digest of *url*.

    Accepts either str or bytes; str input is encoded to UTF-8 bytes first.
    """
    if isinstance(url, str):
        # hashing requires bytes, so encode str input first
        url = url.encode()
    obj = md5()
    obj.update(url)
    return obj.hexdigest()
    # fix: removed the stray debug print(url) — a library helper should not
    # write to stdout on every call


if __name__ == '__main__':
    print(get_md5('www.baidu.com'))
import scrapy


class XkdDribbbleSpiderItem(scrapy.Item):
    """Container for a single scraped Dribbble story."""

    title = scrapy.Field()
    image_url = scrapy.Field()
    date = scrapy.Field()
    # filesystem path where the downloaded image was stored
    image_path = scrapy.Field()
    # URL of the page this item was scraped from
    url = scrapy.Field()
    # MD5 hash of the page URL
    url_id = scrapy.Field()
import scrapy
from urllib import parse
from scrapy.http import Request
from datetime import datetime
from ..items import XkdDribbbleSpiderItem
from ..utils.md5_tool import get_md5


class DribbbleSpider(scrapy.Spider):
    """Crawl Dribbble stories and yield one populated item per story page."""

    name = 'dribbble'
    allowed_domains = ['dribbble.com']
    start_urls = ['https://dribbble.com/stories']

    def parse(self, response):
        # every teaser <a> links to a story page and carries its preview image
        for teaser in response.css('div.teaser a'):
            image_url = teaser.css('img::attr(src)').extract()[0]
            page_url = teaser.css('::attr(href)').extract()[0]
            # carry the preview image URL along to the detail-page callback
            yield Request(url=parse.urljoin(response.url, page_url),
                          callback=self.parse_analyse,
                          meta={'a_image_url': image_url})

    def parse_analyse(self, response):
        # extract the fields of a single story page
        title = response.css('header h1::text').extract_first()
        image_url = response.meta.get('a_image_url')
        raw_date = response.css('p span.date::text').extract()[0].strip()
        date = datetime.strptime(raw_date, '%b %d, %Y').date()

        item = XkdDribbbleSpiderItem()
        item['title'] = title
        item['image_url'] = [image_url]
        item['date'] = date
        item['url'] = response.url
        item['url_id'] = get_md5(response.url)
        # hand the populated item to the pipelines for persistence
        yield item
import codecs
import json


class JsonSavePipeline:
    """Append each scraped item to blog.json, one JSON object per line."""

    def process_item(self, item, spider):
        """Serialize *item* as a JSON line, append it to blog.json, and
        return the item so later pipeline stages still receive it.
        """
        # convert the item returned by the spider into a plain dict
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        # explicit utf-8 is required since ensure_ascii=False may emit non-ASCII;
        # the context manager closes the file even if the write fails
        with codecs.open('blog.json', mode='a', encoding='utf-8') as file:
            file.write(line)
        # bug fix: the original never returned the item (despite its comment),
        # which passes None to every subsequent pipeline stage
        return item
'XKD_Dribbble_Spider.pipelines.JsonSavePipeline': 2,
參考:https://www.9xkd.com/user/plan-view.html?id=1871857091