Below is an introduction to the methods of a downloader middleware:
```python
import random  # needed for random.choice below


class MiddleproDownloaderMiddleware(object):
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]
    # Candidate proxy IPs
    PROXY_http = [
        '153.180.102.104:80',
        '195.208.131.189:56055',
    ]
    PROXY_https = [
        '120.83.49.90:9000',
        '95.189.112.214:35508',
    ]

    # Intercepts every request that has not raised an exception
    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called

        # Spoof the request's User-Agent with one drawn from the UA pool
        print('this is process_request')
        request.headers['User-Agent'] = random.choice(self.user_agent_list)
        print(request.headers['User-Agent'])

        # # Set a proxy for the request from the proxy pool
        # # (note: Scrapy expects the proxy URL to include a scheme)
        # if request.url.split(':')[0] == 'http':
        #     request.meta['proxy'] = 'http://' + random.choice(self.PROXY_http)
        # else:
        #     request.meta['proxy'] = 'https://' + random.choice(self.PROXY_https)
        return None

    # Intercepts every response
    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    # Intercepts requests that raised an exception
    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain

        # Switch the failed request to a proxy from the proxy pool
        print('this is process_exception!')
        if request.url.split(':')[0] == 'http':
            request.meta['proxy'] = 'http://' + random.choice(self.PROXY_http)
        else:
            request.meta['proxy'] = 'https://' + random.choice(self.PROXY_https)
        # Return the request so Scrapy re-schedules it with the new proxy
        return request
```
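Scrapy only runs this middleware once it is enabled in settings.py. A minimal sketch, where the module path `middlePro.middlewares` is an assumption inferred from the middleware class name:

```python
# settings.py -- minimal sketch; 'middlePro.middlewares' is an assumed
# module path based on the class name MiddleproDownloaderMiddleware.
DOWNLOADER_MIDDLEWARES = {
    'middlePro.middlewares.MiddleproDownloaderMiddleware': 543,
}
```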
Sometimes the data we want does not live on a single page. With news sites, for example, we grab the title from the listing page and then click through to a detail page for the full details.
The spider file:
```python
import scrapy

# Assumed import path for the item class; adjust to your project layout.
from middlePro.items import MiddleItem


class FirstSpider(scrapy.Spider):
    name = 'first'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://war.163.com/']

    def parse_detail(self, response):
        tag_list = []
        # Recover the item handed over from parse() via the request's meta
        item = response.meta["item"]
        a_list = response.xpath('/html/body/div[2]/div[2]/div[2]/div[1]/a')

        for a in a_list:
            tag = a.xpath('./text()').extract_first()
            tag_list.append(tag)
        # Store the collected tags on the item before yielding it
        # (assumes the item defines a 'tag' field)
        item['tag'] = tag_list
        yield item

    def parse(self, response):
        li_list = response.xpath('//div[@class="today_news"]/ul/li')
        for li in li_list:
            item = MiddleItem()
            title = li.xpath('./a/text()').extract_first()
            item['title'] = title
            print(item)
            detail_url = li.xpath('./a/@href').extract_first() + '#p=E9DPI10E4T8E0001NOS'
            print(detail_url)
            # Pass the half-filled item along to the detail callback via meta
            yield scrapy.Request(url=detail_url, callback=self.parse_detail,
                                 meta={"item": item})
```
After each listing entry is extracted, another request is sent for its detail page. Here we write a separate parse_detail callback to parse the detail page, and pass the item along as a parameter through the request's meta.
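For completeness, the spider above assumes an item class with a title field, plus a tag field to hold the collected tags. A minimal sketch of the corresponding items.py, with field names inferred from the spider code rather than taken from the original project:

```python
# items.py -- minimal sketch; the field names 'title' and 'tag' are
# inferred from the spider above, not confirmed by the original source.
import scrapy


class MiddleItem(scrapy.Item):
    title = scrapy.Field()
    tag = scrapy.Field()
```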