Steps
How the Scrapy engine pulls the starting URLs from the spider:
1. Call start_requests and take its return value.
2. v = iter(return value)
3. req1 = v.__next__()
   req2 = v.__next__()
   req3 = v.__next__()
4. Put all of them into the scheduler.
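A minimal sketch (not part of the original post) of what those steps amount to: the engine does not care whether start_requests returns a list or a generator, because it wraps the return value with iter() and then pulls requests one at a time with __next__(). The spider class and URLs below are illustrative assumptions.

```python
import scrapy
from scrapy.http import Request


class DemoSpider(scrapy.Spider):
    # hypothetical spider, used only to replay the engine's steps by hand
    name = 'demo'
    start_urls = ['https://example.com/a', 'https://example.com/b']

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url=url)


spider = DemoSpider()
ret = spider.start_requests()   # step 1: call start_requests and take the return value
v = iter(ret)                   # step 2: works the same for a list or a generator
req1 = v.__next__()             # step 3: pull requests one by one
req2 = v.__next__()
print(req1.url, req2.url)       # step 4 (inside Scrapy): each request goes to the scheduler
```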
Writing the spider
```python
class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    # Targeted crawl: only this domain is allowed
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']
    cookie_dict = {}

    def start_requests(self):
        # Method 1
        # for url in self.start_urls:
        #     yield Request(url=url)
        # Method 2
        # req_list = []
        # for url in self.start_urls:
        #     req_list.append(Request(url=url))
        # return req_list
        pass
```
Knowledge points used
An iterable or a generator can be turned into an iterator directly with iter(). Because of this, when you customize how the start URLs are issued (by overriding start_requests) you can send POST requests yourself instead of the built-in default GET, and the URLs can also be pulled from a cache such as Redis rather than hard-coded.
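A hedged sketch of both ideas (the login URL, form fields, and Redis key below are assumptions, not from the original post): start_requests can yield FormRequest objects to POST, and the URLs it iterates over can come from Redis.

```python
import redis
import scrapy
from scrapy.http import FormRequest


class LoginSpider(scrapy.Spider):
    name = 'login_demo'  # hypothetical spider name

    def start_requests(self):
        # Pull the start URLs from Redis instead of a hard-coded start_urls list.
        # 'spider:start_urls' is an assumed key name.
        r = redis.Redis(host='127.0.0.1', port=6379)
        for raw in r.lrange('spider:start_urls', 0, -1):
            url = raw.decode('utf-8')
            # FormRequest sends a POST with a form-encoded body by default,
            # instead of the GET that the base Spider.start_requests would issue.
            yield FormRequest(url=url,
                              formdata={'user': 'xxx', 'pwd': 'yyy'},
                              callback=self.parse)

    def parse(self, response):
        pass
```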
Source code:
Source code flow analysis
Read this together with my blog post on depth: http://www.javashuo.com/article/p-gixrsyne-dx.html
```python
# -*- coding: utf-8 -*-
import scrapy
from wyb.items import WybItem
from scrapy.dupefilters import RFPDupeFilter
from scrapy.http.response.html import HtmlResponse
from scrapy.http.cookies import CookieJar
from urllib.parse import urlencode
from scrapy.http import Request


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    # Targeted crawl: only this domain is allowed
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']

    def start_requests(self):
        import os
        # Proxy settings, handled by HttpProxyMiddleware in the downloader middlewares
        os.environ['HTTP_PROXY'] = '1.1.1.2'
        os.environ['HTTPS_PROXY'] = 'http://root:woshinizuzong@192.168.10.10:8888/'
        # Method 1
        for url in self.start_urls:
            yield Request(url=url)
        # Method 2
        # req_list = []
        # for url in self.start_urls:
        #     req_list.append(Request(url=url))
        # return req_list

    def parse(self, response):
        """
        Response to the first request
        :param response:
        :return:
        """
        from scrapy.spidermiddlewares.depth import DepthMiddleware
        from scrapy.http import Response
        # response.request
        # response.request.meta  (None at this point)
        print(response.meta.get('depth', 0))
        # response.request.meta['depth'] = 0

        page_list = response.xpath('//div[@id="dig_lcpage"]//a/@href').extract()
        for page in page_list:
            from scrapy.http import Request
            page = "https://dig.chouti.com" + page
            # Keep issuing requests, with parse as the callback again
            yield Request(url=page, callback=self.parse,
                          meta={"proxy": "http://root:woshinizuzong@192.168.10.10:8888/"})
```
A look at the source
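For reference, a rough sketch of where those steps live in Scrapy 1.x, paraphrased from memory rather than quoted verbatim (verify against your installed version): the Crawler wraps the spider's start_requests with iter(), and the engine then drains that iterator with next(), handing each request to the scheduler.

```python
# Paraphrased sketch of Scrapy 1.x internals (not verbatim source).

# scrapy/crawler.py -- Crawler.crawl():
#   self.spider = self._create_spider(*args, **kwargs)
#   self.engine = self._create_engine()
#   start_requests = iter(self.spider.start_requests())   # steps 1 and 2
#   yield self.engine.open_spider(self.spider, start_requests)

# scrapy/core/engine.py -- ExecutionEngine._next_request():
#   if slot.start_requests and not self._needs_backout(spider):
#       try:
#           request = next(slot.start_requests)            # step 3
#       except StopIteration:
#           slot.start_requests = None
#       else:
#           self.crawl(request, spider)                    # step 4: into the scheduler
```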
Summary:
- Depth: starts at 0; each time a new request is yielded, the depth carried in the originating request's meta is incremented by 1.
- Priority: priority -= depth * DEPTH_PRIORITY, and the DEPTH_PRIORITY setting defaults to 0, so priority is unchanged unless you configure it. A sketch of both rules follows this list.
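A minimal sketch of that logic, modeled on what scrapy.spidermiddlewares.depth.DepthMiddleware does per yielded request (simplified and paraphrased; the prio/maxdepth names mirror the middleware, but this is not the verbatim source):

```python
def adjust_depth_and_priority(response, new_request, prio=0, maxdepth=0):
    """Simplified version of the depth/priority bookkeeping applied to each new request."""
    # The first response carries no 'depth' in meta, so it is treated as depth 0.
    parent_depth = response.meta.get('depth', 0)
    depth = parent_depth + 1
    new_request.meta['depth'] = depth
    # DEPTH_PRIORITY (prio) defaults to 0, so priority is normally left untouched.
    if prio:
        new_request.priority -= depth * prio
    # DEPTH_LIMIT (maxdepth) of 0 means "no limit"; otherwise drop requests that are too deep.
    if maxdepth and depth > maxdepth:
        return None
    return new_request
```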