Below is an introduction to the methods of a downloader middleware:
```python
import random  # needed for random.choice below


class MiddleproDownloaderMiddleware(object):
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]
    # Candidate proxy IPs
    PROXY_http = [
        '153.180.102.104:80',
        '195.208.131.189:56055',
    ]
    PROXY_https = [
        '120.83.49.90:9000',
        '95.189.112.214:35508',
    ]

    # Intercepts every request that has not raised an exception
    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called

        # Spoof the request's User-Agent with one drawn from the UA pool
        print('this is process_request')
        request.headers['User-Agent'] = random.choice(self.user_agent_list)
        print(request.headers['User-Agent'])

        # # Set a proxy for the request from the proxy pool
        # # (note: Scrapy expects the proxy URL to include a scheme)
        # if request.url.split(':')[0] == 'http':
        #     request.meta['proxy'] = 'http://' + random.choice(self.PROXY_http)
        # else:
        #     request.meta['proxy'] = 'https://' + random.choice(self.PROXY_https)
        return None

    # Intercepts every response
    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    # Intercepts requests that raised an exception
    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain

        # Switch the failed request to a proxy from the proxy pool
        print('this is process_exception!')
        if request.url.split(':')[0] == 'http':
            request.meta['proxy'] = 'http://' + random.choice(self.PROXY_http)
        else:
            request.meta['proxy'] = 'https://' + random.choice(self.PROXY_https)
        # Return the request so Scrapy re-schedules it with the new proxy
        return request
```
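Scrapy only runs this middleware once it is enabled in settings.py. A minimal sketch, where the module path `middlePro.middlewares` is an assumption inferred from the middleware class name:

```python
# settings.py -- minimal sketch; 'middlePro.middlewares' is an assumed
# module path based on the class name MiddleproDownloaderMiddleware.
DOWNLOADER_MIDDLEWARES = {
    'middlePro.middlewares.MiddleproDownloaderMiddleware': 543,
}
```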
Sometimes the data we want does not live on a single page. With news sites, for example, we grab the title from the listing page and then click through to a detail page for the full details.
The spider file:
```python
import scrapy

# Assumed import path for the item class; adjust to your project layout.
from middlePro.items import MiddleItem


class FirstSpider(scrapy.Spider):
    name = 'first'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://war.163.com/']

    def parse_detail(self, response):
        tag_list = []
        # Recover the item handed over from parse() via the request's meta
        item = response.meta["item"]
        a_list = response.xpath('/html/body/div[2]/div[2]/div[2]/div[1]/a')

        for a in a_list:
            tag = a.xpath('./text()').extract_first()
            tag_list.append(tag)
        # Store the collected tags on the item before yielding it
        # (assumes the item defines a 'tag' field)
        item['tag'] = tag_list
        yield item

    def parse(self, response):
        li_list = response.xpath('//div[@class="today_news"]/ul/li')
        for li in li_list:
            item = MiddleItem()
            title = li.xpath('./a/text()').extract_first()
            item['title'] = title
            print(item)
            detail_url = li.xpath('./a/@href').extract_first() + '#p=E9DPI10E4T8E0001NOS'
            print(detail_url)
            # Pass the half-filled item along to the detail callback via meta
            yield scrapy.Request(url=detail_url, callback=self.parse_detail,
                                 meta={"item": item})
```
After each listing entry is extracted, another request is sent for its detail page. Here we write a separate parse_detail callback to parse the detail page, and pass the item along as a parameter through the request's meta.
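For completeness, the spider above assumes an item class with a title field, plus a tag field to hold the collected tags. A minimal sketch of the corresponding items.py, with field names inferred from the spider code rather than taken from the original project:

```python
# items.py -- minimal sketch; the field names 'title' and 'tag' are
# inferred from the spider above, not confirmed by the original source.
import scrapy


class MiddleItem(scrapy.Item):
    title = scrapy.Field()
    tag = scrapy.Field()
```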