The spider only needs to collect the image URLs; it must hand them to the pipeline as a list.
import re

import scrapy

from ..items import PostBarItem  # adjust the import path to match your project layout


class PictureSpiderSpider(scrapy.Spider):
    name = 'picture_spider'
    allowed_domains = ['tieba.baidu.com']
    start_urls = ['https://tieba.baidu.com/f?kw=%E5%8A%A8%E6%BC%AB%E5%A3%81%E7%BA%B8']

    def parse(self, response):
        # IDs and titles of the threads on one page of the forum
        theme_urls = re.findall(
            r'<a rel="noreferrer" href="/p/(\d+)" title="(.*?)" target="_blank" class="j_th_tit ">',
            response.text, re.S)
        for theme in theme_urls:
            # URL of a single thread
            theme_url = 'https://tieba.baidu.com/p/' + theme[0]
            # Enter each thread
            yield scrapy.Request(url=theme_url, callback=self.parse_theme)

        # URL of the forum's next page ("下一页" is the literal link text on the page)
        next_url = re.findall(
            r'<a href="//tieba.baidu.com/f\?kw=%E5%8A%A8%E6%BC%AB%E5%A3%81%E7%BA%B8&ie=utf-8&pn=(\d+)" class="next pagination-item " >下一页></a>',
            response.text, re.S)
        if next_url:
            next_url = self.start_urls[0] + '&pn=' + next_url[0]
            yield scrapy.Request(url=next_url)

    # Download all images inside each thread
    def parse_theme(self, response):
        item = PostBarItem()
        # Thumbnail URLs of the images on one page of the thread
        pic_ids = response.xpath('//img[@class="BDE_Image"]/@src').extract()
        # Collect the image URLs in a list
        item['pic_urls'] = []
        for pic_url in pic_ids:
            # File name of each image
            item['pic_name'] = pic_url.split('/')[-1]
            # Full-size image URL
            url = 'http://imgsrc.baidu.com/forum/pic/item/' + item['pic_name']
            # Append the URL to the list
            item['pic_urls'].append(url)
        # Hand the item to the pipeline for downloading
        yield item

        # After one page of images is done, continue with the next page of the thread
        next_url = response.xpath('//a[contains(text(),"下一页")]/@href').extract_first()
        if next_url:
            yield scrapy.Request('https://tieba.baidu.com' + next_url, callback=self.parse_theme)
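The spider fills a PostBarItem with a pic_name and a pic_urls list, but items.py is not shown in the post; a minimal sketch with the field names inferred from the spider above:

# items.py -- a minimal sketch inferred from the spider code
import scrapy


class PostBarItem(scrapy.Item):
    pic_name = scrapy.Field()  # file name of a single image
    pic_urls = scrapy.Field()  # list of full-size image URLs handed to the pipeline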
import scrapy
from scrapy.pipelines.images import ImagesPipeline


class PostBarPipeline(ImagesPipeline):
    # Only needed for sites that require request headers
    headers = {
        'User-Agent': '',
        'Referer': '',
    }

    def get_media_requests(self, item, info):
        for pic_url in item['pic_urls']:
            # Generate one Request per image URL
            yield scrapy.Request(pic_url)
            # When request headers are required, pass the headers argument instead:
            # yield scrapy.Request(pic_url, headers=self.headers)

    def file_path(self, request, response=None, info=None):
        # Rename the file (keeping its extension); without overriding this
        # method the images are saved under their hash values
        pic_path = request.url.split('/')[-1]
        return pic_path
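As an optional extension (not part of the original code), the ImagesPipeline also exposes an item_completed hook that can drop items whose images all failed to download; a rough sketch, with the class name CheckedPostBarPipeline chosen here only for illustration:

# An optional variant of the pipeline that discards items with no downloaded images
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


class CheckedPostBarPipeline(ImagesPipeline):
    def item_completed(self, results, item, info):
        # results is a list of (success, file_info_or_error) tuples, one per
        # request yielded by get_media_requests
        downloaded = [data['path'] for ok, data in results if ok]
        if not downloaded:
            raise DropItem('no images were downloaded for this item')
        return item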
Activate the pipeline
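Pipelines are enabled through the ITEM_PIPELINES setting in settings.py. A minimal sketch, assuming the Scrapy project module is named post_bar (adjust the dotted path to your own project):

# settings.py -- 'post_bar' is a placeholder for the actual project module name
ITEM_PIPELINES = {
    'post_bar.pipelines.PostBarPipeline': 300,
}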
Set the image storage path
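The ImagesPipeline writes files under the directory given by the IMAGES_STORE setting (it also requires Pillow to be installed); the path below is only an example:

# settings.py -- IMAGES_STORE is where the ImagesPipeline saves the downloaded images
IMAGES_STORE = './images'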