mzt.py
# -*- coding: utf-8 -*-
import urllib.parse
from copy import deepcopy

import scrapy


class MztSpider(scrapy.Spider):
    """Crawl the gallery listing of www.mmjpg.com and emit one item per image.

    Flow: ``parse`` walks the listing pages and follows each gallery link,
    ``parse_detail`` reads the gallery's page count and requests every image
    page, and ``parse_img`` extracts the image URL from one image page.

    Each emitted item is a dict with the keys ``detail_url``, ``title``,
    ``total``, ``img_url`` and ``img_urls`` (a one-element list holding
    ``img_url``).
    """

    name = 'mzt'
    allowed_domains = ['www.mmjpg.com']
    start_urls = ['http://www.mmjpg.com/']

    def parse(self, response):
        """Yield a detail-page request per gallery, then follow pagination."""
        links = response.xpath('/html/body/div[2]/div[1]/ul//li/span[1]/a')
        for link in links:
            href = link.xpath('./@href').extract_first()
            if href is None:
                # An anchor without href cannot be followed.
                continue
            item = {
                # Resolve against the current page in case the href is relative.
                'detail_url': urllib.parse.urljoin(response.url, href),
                'title': link.xpath('./text()').extract_first(),
            }
            self.logger.debug('%s %s', item['detail_url'], item['title'])
            yield scrapy.Request(
                item['detail_url'],
                callback=self.parse_detail,
                meta={"item": item},
            )

        # Pagination. The link text is matched in both its Traditional and
        # Simplified spelling. No item is attached to this request: parse()
        # never reads one, and the loop variable is undefined when the
        # listing is empty.
        next_page = response.xpath(
            '//a[contains(text(),"下一頁") or contains(text(),"下一页")]/@href'
        ).extract_first()
        self.logger.debug('next page: %s', next_page)
        if next_page is not None:
            yield scrapy.Request(
                urllib.parse.urljoin(response.url, next_page),
                callback=self.parse,
            )

    def parse_detail(self, response):
        """Read the gallery's page count and request every image page."""
        item = response.meta['item']
        item['img_urls'] = []
        item['total'] = response.xpath(
            '//*[@id="page"]/a[last()-1]/text()'
        ).extract_first()
        try:
            page_count = int(item['total'])
        except (TypeError, ValueError):
            # Pager missing (None) or not numeric: nothing to enumerate.
            self.logger.warning(
                'No page count found on %s (got %r)', response.url, item['total']
            )
            return

        base_url = response.url.rstrip('/')
        for page_no in range(1, page_count + 1):
            yield scrapy.Request(
                base_url + "/" + str(page_no),
                callback=self.parse_img,
                meta={"item": item},
            )

    def parse_img(self, response):
        """Return an item describing the single image shown on this page.

        The item received through ``meta`` is shared by every image-page
        request of the gallery, so it is copied rather than mutated; each
        returned item therefore carries exactly one image URL.
        """
        img_url = response.xpath(
            '//*[@id="content"]/a/img/@src'
        ).extract_first()
        if img_url is None:
            self.logger.warning('No image found on %s', response.url)
            return None

        image_item = deepcopy(response.meta['item'])
        image_item['img_url'] = img_url
        image_item['img_urls'] = [img_url]
        return image_item
pipelines.py
# 重寫 file_path:由於 file_path 只返回一個字符串,後續的處理還在 ImagesPipeline 類的其餘函數裏。
class MyImagesPipeline(ImagesPipeline):
    """Image pipeline that stores each image under a folder named after the item title."""

    def get_media_requests(self, item, info):
        """Yield one download request per URL in ``item['img_urls']``.

        The item is attached to the request meta so ``file_path`` can read its
        title. ``meta['referer']`` carries the image URL itself; a downloader
        middleware is expected to copy it into the Referer header to get past
        hotlink protection.
        """
        for image_url in item['img_urls']:
            yield Request(image_url, meta={'item': item, 'referer': image_url})

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path ``imge/<title>/<last URL segment>``.

        ``item`` is accepted as an optional keyword so the override matches the
        signature used by newer Scrapy releases; when it is not supplied the
        item is taken from ``request.meta``.
        """
        if item is None:
            item = request.meta['item']
        folder = item['title'].strip()
        image_name = request.url.split('/')[-1]
        return u'imge/{0}/{1}'.format(folder, image_name)

    def item_completed(self, results, item, info):
        """Return the item, or drop it when no image was downloaded.

        Raises:
            DropItem: if none of the ``results`` entries succeeded.
        """
        image_paths = [result['path'] for ok, result in results if ok]
        if not image_paths:
            raise DropItem('Item contains no images')
        return item
settings.py
# Enable the project's downloader middleware.
DOWNLOADER_MIDDLEWARES = {
    'meizitu.middlewares.MeizituDownloaderMiddleware': 543,
}

# Directory where downloaded files are stored.
IMAGES_STORE = r'C:\study\meizitu\meizitu'

# The image pipeline avoids re-downloading images that were downloaded
# recently. FILES_EXPIRES (or IMAGES_EXPIRES) adjusts that expiry period,
# given as a number of days.
IMAGES_EXPIRES = 30

ITEM_PIPELINES = {
    'meizitu.pipelines.MyImagesPipeline': 2,
}
middlewares.py
def process_request(self, request, spider):
    """Copy ``request.meta['referer']`` into the request's referer header.

    Used to get past hotlink protection on image downloads. Requests whose
    meta has no (or an empty) ``referer`` entry are left untouched.

    Returns:
        None, so the request continues through the remaining processing.
    """
    # Called for each request that goes through the downloader
    # middleware.

    # Must either:
    # - return None: continue processing this request
    # - or return a Response object
    # - or return a Request object
    # - or raise IgnoreRequest: process_exception() methods of
    #   installed downloader middleware will be called
    referer = request.meta.get('referer', None)
    if referer:
        request.headers['referer'] = referer
    return None