settings.py
# 1 Whether to obey the robots.txt protocol
ROBOTSTXT_OBEY = False
# 2 Browser type / User-Agent (by default the client identifies itself as scrapy)
# USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
# 3 Log level (default is INFO; while the spider runs, INFO messages get printed)
# Set it to ERROR to print only error messages (improves crawling efficiency)
LOG_LEVEL = 'ERROR'
# 4 Use our own dedup rule
DUPEFILTER_CLASS = 'filter.MyDupeFilter'
# 5 Maximum number of concurrent requests, 16 by default; you can raise it yourself
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32
# Save to a JSON file
1 scrapy crawl cnblogs -o cnblogs.json  (no need to memorize this one)
    -1 Write a class with fields in items.py
        class CnblogsSpiderItem(scrapy.Item):
            title = scrapy.Field()
            desc = scrapy.Field()
            url = scrapy.Field()
            author = scrapy.Field()
            # important: the article body (filled in later when the matching detail page is crawled)
            content = scrapy.Field()
    -2 In the spider, put the fields you want to save into the item object
        article_item['url'] = url
        article_item['title'] = title
        article_item['desc'] = desc
        article_item['author'] = author
        yield article_item
    -3 In the console run: scrapy crawl cnblogs -o cnblogs.json
        -o means output; cnblogs.json is the output file name (after the run, a cnblogs.json file appears in the project root)
2 The common way -- remember only this one
    -1 Write a class with fields in items.py (same CnblogsSpiderItem as above)
    -2 In the spider, put the fields into the item object and yield it (same as above)
    -3 Configure the pipelines in settings.py
        ITEM_PIPELINES = {
            'cnblogs_spider.pipelines.CnblogsSpiderFilePipeline': 300,   # the number is the priority; the smaller the number, the higher the priority
            'cnblogs_spider.pipelines.CnblogsSpiderMysqlPipeline': 400,  # the number is the priority; the smaller the number, the higher the priority
        }
    -4 Write the pipeline classes in pipelines.py
        class CnblogsSpiderFilePipeline:
            def open_spider(self, spider):
                # runs when the spider starts; spider is the spider object
                print(spider.name)
                print('spider started')
                self.f = open('cnblogs.txt', 'w', encoding='utf-8')

            def close_spider(self, spider):
                # runs when the spider stops
                print('spider stopped')
                self.f.close()

            def process_item(self, item, spider):
                self.f.write(item['title'] + item['desc'] + item['author'] + item['url'])
                self.f.write('\n')
                return item

        import pymysql

        class CnblogsSpiderMysqlPipeline:
            def open_spider(self, spider):
                self.conn = pymysql.connect(
                    host='127.0.0.1', user='root',
                    password="123", database='cnblogs', port=3306)
                self.cursor = self.conn.cursor()

            def close_spider(self, spider):
                self.conn.commit()
                self.cursor.close()
                self.conn.close()

            def process_item(self, item, spider):
                sql = 'insert into aritcle (title, `desc`, url, author) values (%s, %s, %s, %s)'
                self.cursor.execute(sql, args=[item['title'], item['desc'], item['url'], item['author']])
                return item
1 Pass a parameter along to another request and pick it up in that request's response (via meta)
    # Request(url, meta={'key': value})
    yield Request(url=url, callback=self.parser_detail, meta={'item': article_item})
2 In the parse callback, get it back through the response object
    # response.meta.get('key')
    item = response.meta.get('item')
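Putting the two halves together, a minimal self-contained sketch of handing an item from the list-page callback to the detail-page callback; the spider name and the selectors are illustrative placeholders, not part of the original project:

import scrapy
from scrapy import Request

class MetaDemoSpider(scrapy.Spider):
    name = 'meta_demo'
    start_urls = ['https://www.cnblogs.com/']

    def parse(self, response):
        for article in response.xpath('//article[@class="post-item"]'):
            item = {'title': article.xpath('.//a[@class="post-item-title"]/text()').extract_first()}
            url = article.css('a.post-item-title::attr(href)').extract_first()
            # attach the half-filled item to the detail request
            yield Request(url=url, callback=self.parse_detail, meta={'item': item})

    def parse_detail(self, response):
        # pick the item back up from response.meta and finish filling it in
        item = response.meta.get('item')
        item['content'] = response.css('#cnblogs_post_body').extract_first()
        yield item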
Improving scrapy's crawling efficiency (it is an asynchronous framework built on Twisted, so performance is already high, but there are still things worth tuning):
- Everything is done in the settings file (there is also a built-in default settings module, analogous to Django):
#1 Increase concurrency (number of concurrent requests):
    Scrapy runs 16 concurrent requests by default, which can be raised. In settings, set CONCURRENT_REQUESTS = 100 to allow 100 concurrent requests.
    CONCURRENT_REQUESTS = 100
#2 Lower the log level:
    When scrapy runs it produces a lot of log output; to reduce CPU usage, set the log level to INFO or ERROR. In settings: LOG_LEVEL = 'INFO'
#3 Disable cookies (cnblogs does not need cookies):
    If cookies are not really needed, disable them while crawling to reduce CPU usage and speed things up. In settings: COOKIES_ENABLED = False
#4 Disable retries (do not retry pages that failed):
    Re-requesting failed HTTP requests (retrying) slows crawling down, so retries can be disabled. In settings: RETRY_ENABLED = False
#5 Reduce the download timeout (use a short timeout):
    When crawling a very slow link, a smaller download timeout lets stuck links be abandoned quickly, which improves efficiency. In settings: DOWNLOAD_TIMEOUT = 10  (a 10-second timeout)
# Remembering the first 5 is enough
#6 Add a cookie pool
#7 Add a proxy pool
#8 Randomize the request headers (keep a list of browser User-Agent strings; the fake_useragent module can be used for this -- see the sketch after this list)
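A minimal sketch of #1-#5 as settings, plus the random User-Agent idea from #8 using fake_useragent. The RandomUserAgentMiddleware class name and its priority number are assumptions, not part of the original project:

# settings.py -- the five tuning options in one place
CONCURRENT_REQUESTS = 100
LOG_LEVEL = 'ERROR'
COOKIES_ENABLED = False
RETRY_ENABLED = False
DOWNLOAD_TIMEOUT = 10

DOWNLOADER_MIDDLEWARES = {
    'cnblogs_spider.middlewares.RandomUserAgentMiddleware': 542,  # hypothetical middleware, sketched below
}

# middlewares.py -- pick a random real-browser User-Agent for every request
# requires: pip install fake_useragent
from fake_useragent import UserAgent

class RandomUserAgentMiddleware:
    def __init__(self):
        self.ua = UserAgent()

    def process_request(self, request, spider):
        # overwrite the User-Agent header with a random browser string
        request.headers['User-Agent'] = self.ua.random
        return None  # continue to the next downloader middleware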
# Downloader middlewares; there can be several; the smaller the number, the higher the priority
DOWNLOADER_MIDDLEWARES = {
    'cnblogs_spider.middlewares.CnblogsSpiderDownloaderMiddleware': 543,
}
1 When a request passes through (process_request)
    # Must either:
    # - return None: continue processing this request -- go on to the next downloader middleware's process_request
    # - or return a Response object -- it goes to the engine, the engine gives it to the spider, parsing starts
    # - or return a Request object -- it goes to the engine, the engine gives it to the scheduler, and it is queued again
    # - or raise IgnoreRequest -- raising triggers the process_exception() methods
    # Summary:
        return None -> keep crawling
        return a Response object -> it goes to the engine, then to the spider, to be parsed
        return a Request object -> it goes to the engine, then to the scheduler, waiting to be scheduled again
    # When is it used:
        adding a proxy, adding cookies, changing the browser type (User-Agent)
        integrating selenium
    # Why integrate selenium?
        # scrapy is built on Twisted and uses coroutines internally. selenium is blocking, which hurts efficiency, but because it is so useful it is still widely used in crawling.
    # change the cookies
    request.cookies = {'name': 'lqz'}
    # use a proxy
    proxy = 'http://154.16.63.16:8080'  # fetched from a proxy pool
    request.meta["proxy"] = proxy
    # change the request headers
    request.headers.setlist(b'User-Agent', 'asdfadsfadsf')
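The "fetched from a proxy pool" step above never shows where the proxy comes from. A minimal sketch of a get_proxy() helper, assuming a locally deployed proxy-pool service; the URL http://127.0.0.1:5010/get/ and the 'proxy' JSON field are assumptions that depend on which pool you run:

import requests

def get_proxy():
    # ask the proxy-pool service for one usable proxy address
    res = requests.get('http://127.0.0.1:5010/get/').json()
    return res.get('proxy')  # e.g. '154.16.63.16:8080'

# inside process_request:
# request.meta['proxy'] = 'http://' + get_proxy()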
# Must either:
# - return a Response object -- the normal path: it goes to the engine, and the engine gives it to the spider to parse
# - return a Request object -- it goes to the engine, the engine gives it to the scheduler, waiting to be scheduled again
# - or raise IgnoreRequest -- the response is thrown away; it does not go to the engine or on to the spider for parsing
# Summary
    return a Response -> the normal flow continues
    return a Request -> it is put back on the scheduler
# When is it used
    raise an exception so that the response is not parsed (rarely needed)
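A minimal sketch of that "throw the response away" case, dropping non-200 responses with IgnoreRequest; the middleware name and the status check are just example choices:

from scrapy.exceptions import IgnoreRequest

class DropBadResponseMiddleware:
    def process_response(self, request, response, spider):
        if response.status != 200:
            # discarded: this response never reaches the engine or the spider
            raise IgnoreRequest(f'dropping {request.url}: status {response.status}')
        return response  # normal path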
First write a simple test server yourself; here it is written with Flask
from flask import Flask, request

app = Flask(__name__)
app.secret_key = 'qazwsxedc'

@app.route('/test')
def test():
    print(request.cookies)
    return 'xxxxxxxxx'

if __name__ == '__main__':
    app.run()
Client side: write it in the downloader middleware
from scrapy import signals, Request
from scrapy.http.response.html import HtmlResponse

# downloader middleware (sits between the downloader and the engine)
class CnblogsSpiderDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    # when a request goes out
    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called

        # we have the request object (this request's address) and the spider object (the current spider)
        # print('address of this request, printed from the middleware', request.url)

        # add cookies
        print(request.cookies)
        # build a cookie pool, take one cookie from it and assign it here to switch cookies
        # request.cookies = {'name': 'lqz'}

        # add a proxy (use a proxy pool)
        print(request.meta)
        # proxy = "http://" + get_proxy()
        # proxy = 'http://154.16.63.16:8080'  # fetched from the proxy pool
        # request.meta["proxy"] = proxy

        # change the request headers (tokens are carried in the headers; token pool)
        # request.headers['xx'] = 'sssssssss'

        # change the browser type (User-Agent)
        # request.headers overrides the __setitem__ family of methods
        # print(type(request.headers))
        # print(request.headers)
        from scrapy.http.headers import Headers
        # print(request.headers.get(b'User-Agent'))
        request.headers.setlist(b'User-Agent', 'asdfadsfadsf')
# Using selenium is less efficient than using the native downloader
1 In the spider:
    class ChoutiSpider(scrapy.Spider):
        name = 'chouti'
        allowed_domains = ['www.bilibili.com']
        start_urls = ['https://www.bilibili.com/v/dance/otaku/#/all/default/0/1/']
        bro = webdriver.Chrome(executable_path='./chromedriver')
        bro.implicitly_wait(10)

        @staticmethod
        def close(spider, reason):
            spider.bro.close()
2 Use it directly in the middleware:
    class CnblogsSpiderDownloaderMiddleware:
        def process_request(self, request, spider):
            spider.bro.get(request.url)
            response = HtmlResponse(url=request.url, body=bytes(spider.bro.page_source, encoding='utf-8'))
            return response
3 How to hide the browser window?
    - Use a headless browser (see the sketch below)
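A minimal sketch of the headless option from point 3, using Chrome options; the chromedriver path is the same assumption as above:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')      # run Chrome without opening a window
options.add_argument('--disable-gpu')   # commonly added alongside headless
bro = webdriver.Chrome(executable_path='./chromedriver', options=options)
bro.implicitly_wait(10)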
# Spider dedup approaches:
    - Option 1: scrapy's default dedup rule: an in-memory set
    - Option 2: dedup with a Redis set (a custom dedup rule -- a sketch appears after the custom-filter example below)
    - Option 3: use a bloom filter
1 Dedup happens by default, using: from scrapy.dupefilters import RFPDupeFilter
2 It is configured in scrapy's built-in default settings
3 Under the hood it dedups with a set
4 The more advanced part:
    - A fingerprint is computed for each address (to handle the case where the same address, with its query parameters in a different order, would otherwise look like two different addresses)
        127.0.0.1/?name=lili&age=19&sex=male
    # Are these the same address?
    127.0.0.1/?name=lili&age=19
    127.0.0.1/?age=19&name=lili
    # The underlying trick: the query string after '?' is taken apart and sorted
    fp = self.request_fingerprint(request)  # get a fingerprint; the two addresses above give the same result

## Custom dedup rule
If you write your own dedup class, how do you use it?
    Write a class that inherits BaseDupeFilter and override def request_seen(self, request):
        return True  -> means it has been crawled already
        return False -> means it has not been crawled
# Trying a more powerful dedup scheme
    - A set works, but there is a problem: with an enormous number of addresses (hundreds of millions) the set gets very large and uses a lot of memory
    - Dedup with tiny memory usage: a bloom filter (sketched below)
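A minimal sketch of that bloom-filter dedup idea, assuming the third-party pybloom_live package; the capacity and error_rate values are placeholders used to size the filter. It would be wired up the same way as the set-based MyDupeFilter shown next, by pointing DUPEFILTER_CLASS at it:

# pip install pybloom_live
from pybloom_live import BloomFilter
from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.request import request_fingerprint

class BloomDupeFilter(BaseDupeFilter):
    def __init__(self):
        # roughly one million fingerprints in a few MB, ~0.1% false positives
        self.bloom = BloomFilter(capacity=1000000, error_rate=0.001)

    def request_seen(self, request):
        fp = request_fingerprint(request)
        if fp in self.bloom:
            return True   # (very likely) crawled already
        self.bloom.add(fp)
        return False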
# Use our own dedup rule: settings.py configuration
DUPEFILTER_CLASS = 'filter.MyDupeFilter'

# filter.py
from scrapy.dupefilters import BaseDupeFilter

class MyDupeFilter(BaseDupeFilter):
    pool = set()

    def request_seen(self, request):
        print('custom dedup filter was called')
        if request.url in self.pool:
            return True
        else:
            self.pool.add(request.url)
            return False
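For Option 2 above (dedup with a Redis set, so the seen-URL set can be shared across processes or machines), a minimal sketch assuming a local Redis server and the redis-py package; the key name is a placeholder:

# pip install redis
import redis
from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.request import request_fingerprint

class RedisDupeFilter(BaseDupeFilter):
    def __init__(self):
        self.conn = redis.Redis(host='127.0.0.1', port=6379)

    def request_seen(self, request):
        fp = request_fingerprint(request)
        # SADD returns 0 if the member was already in the set, 1 if it was just added
        return self.conn.sadd('scrapy:dupefilter', fp) == 0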
run.py  # run the project by right-clicking this file; optional
from scrapy.cmdline import execute

# execute(['scrapy', 'crawl', 'cnblogs', '--nolog'])
# execute(['scrapy', 'crawl', 'cnblogs'])
execute(['scrapy', 'crawl', 'chouti'])
filter.py  # our own dedup rule; optional
from scrapy.dupefilters import BaseDupeFilter

class MyDupeFilter(BaseDupeFilter):
    pool = set()

    def request_seen(self, request):
        print('custom dedup filter was called')
        if request.url in self.pool:
            return True
        else:
            self.pool.add(request.url)
            return False
s1.py  # checks whether the same address would be crawled twice; optional
from scrapy.utils.request import request_fingerprint
from scrapy import Request

# Check whether the same address would be crawled twice: the two requests below point at the same
# address with the query parameters in a different order. Conclusion: the fingerprints are identical,
# so the address will not be crawled again.
url1 = Request(url='http://127.0.0.1/?name=lqz&age=19')
url2 = Request(url='http://127.0.0.1/?age=19&name=lqz')
fp1 = request_fingerprint(url1)
fp2 = request_fingerprint(url2)
print(fp1)  # afbaa0881bb50eb208caba3c4ac4aa8ffdbb7ba4
print(fp2)  # afbaa0881bb50eb208caba3c4ac4aa8ffdbb7ba4
cnblogs_spider\settings.py
BOT_NAME = 'cnblogs_spider'

SPIDER_MODULES = ['cnblogs_spider.spiders']
NEWSPIDER_MODULE = 'cnblogs_spider.spiders'

# Obey robots.txt rules
# 1 Whether to obey the robots.txt protocol
ROBOTSTXT_OBEY = False

# 2 Browser type / User-Agent (by default the client identifies itself as scrapy)
# USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'

# 3 Log level (default is INFO; while the spider runs, INFO messages get printed)
# Set it to ERROR to print only error messages (improves crawling efficiency)
LOG_LEVEL = 'ERROR'

# Use our own dedup rule
DUPEFILTER_CLASS = 'filter.MyDupeFilter'

# Maximum number of requests sent at once, 16 by default
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# Downloader middlewares; there can be several; the smaller the number, the higher the priority
DOWNLOADER_MIDDLEWARES = {
    'cnblogs_spider.middlewares.CnblogsSpiderDownloaderMiddleware': 543,
}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Persistence-related configuration
ITEM_PIPELINES = {
    # 'cnblogs_spider.pipelines.CnblogsSpiderFilePipeline': 300,  # the number is the priority; the smaller the number, the higher the priority
    'cnblogs_spider.pipelines.CnblogsSpiderMysqlPipeline': 400,   # the number is the priority; the smaller the number, the higher the priority
}
cnblogs_spider\items.py
import scrapy

# Analogous to models.py: write one class per item type; it must inherit scrapy.Item
# How do you check whether an object is an Item instance?
# What is the difference between isinstance and type?
class CnblogsSpiderItem(scrapy.Item):
    title = scrapy.Field()
    desc = scrapy.Field()
    url = scrapy.Field()
    author = scrapy.Field()
    # important: the article body (filled in when the matching detail page is crawled)
    content = scrapy.Field()
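On the two questions in the comments, a quick illustration (run in the context of the items.py above): isinstance() respects inheritance, so it is the right check for "is this an Item object", while type() only matches the exact class:

item = CnblogsSpiderItem()
print(isinstance(item, scrapy.Item))    # True: CnblogsSpiderItem is a subclass of scrapy.Item
print(type(item) is scrapy.Item)        # False: type() ignores inheritance
print(type(item) is CnblogsSpiderItem)  # True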
cnblogs_spider\pipelines.py
from itemadapter import ItemAdapter

# These classes handle storage: saving to a file, saving to a database

# save the scraped data to a file
class CnblogsSpiderFilePipeline:
    def open_spider(self, spider):
        # runs when the spider starts; spider is the spider object
        print(spider.name)
        print('spider started')
        self.f = open('cnblogs.txt', 'w', encoding='utf-8')

    def close_spider(self, spider):
        # runs when the spider stops
        print('spider stopped')
        self.f.close()

    def process_item(self, item, spider):
        # runs once for every item that is yielded
        # item is the item yielded in the spider
        # with open('cnblogs.txt', 'w', encoding='utf-8') as f:
        #     f.write('title: ' + item['title'])
        #     f.write('summary: ' + item['desc'])
        #     f.write('author: ' + item['author'])
        #     f.write('link: ' + item['url'])
        # print('an item arrived')
        self.f.write(item['title'] + item['desc'] + item['author'] + item['url'])
        self.f.write('\n')
        return item


import pymysql

class CnblogsSpiderMysqlPipeline:
    def open_spider(self, spider):
        self.conn = pymysql.connect(
            host='127.0.0.1', user='root',
            password="123", database='cnblogs', port=3306)
        self.cursor = self.conn.cursor()

    def close_spider(self, spider):
        self.conn.commit()
        self.cursor.close()
        self.conn.close()

    def process_item(self, item, spider):
        sql = 'insert into aritcle (title, `desc`, url, author, content) values (%s, %s, %s, %s, %s)'
        self.cursor.execute(sql, args=[item['title'], item['desc'], item['url'], item['author'], item['content']])
        return item
cnblogs_spider\middlewares.py  # note: the selenium integration needs a browser driver; put a chromedriver that matches your local Chrome into the project
from scrapy import signals
from itemadapter import is_item, ItemAdapter


# spider middleware (sits between the spider and the engine)
class CnblogsSpiderSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


from scrapy import Request
from scrapy.http.response.html import HtmlResponse


# downloader middleware (sits between the downloader and the engine)
class CnblogsSpiderDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    # when a request goes out
    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called

        # we have the request object (this request's address) and the spider object (the current spider)
        # print('address of this request, printed from the middleware', request.url)

        # add cookies
        print(request.cookies)
        # build a cookie pool, take one cookie from it and assign it here to switch cookies
        # request.cookies = {'name': 'lqz'}

        # add a proxy (use a proxy pool)
        print(request.meta)
        # proxy = "http://" + get_proxy()
        # proxy = 'http://154.16.63.16:8080'  # fetched from the proxy pool
        # request.meta["proxy"] = proxy

        # change the request headers (tokens are carried in the headers; token pool)
        # request.headers['xx'] = 'sssssssss'
        # change the browser type (User-Agent)
        # request.headers overrides the __setitem__ family of methods
        # print(type(request.headers))
        # print(request.headers)
        from scrapy.http.headers import Headers
        # print(request.headers.get(b'User-Agent'))
        # request.headers.setlist(b'User-Agent', 'asdfadsfadsf')

        # integrate selenium
        # from selenium import webdriver
        # bro = webdriver.Chrome(executable_path='./chromedriver')
        spider.bro.get(request.url)
        # bro.page_source
        # wrap the page source in a Response object
        # scrapy.http.response.html.HtmlResponse
        response = HtmlResponse(url=request.url, body=bytes(spider.bro.page_source, encoding='utf-8'))
        return response

    # when a response comes back
    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    # when an exception is raised
    # here you could collect every address that failed (request.url) and store it to crawl again later
    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    # when the spider opens
    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
cnblogs_spider\spiders\cnblogs.py  # uses the default dedup rule
import scrapy
from .. import items  # relative import only works inside a package; if this file is run as a script, an absolute import is required
# from cnblogs_spider import items
from scrapy import Request


class CnblogsSpider(scrapy.Spider):
    name = 'cnblogs'  # spider name (must be unique)
    # allowed_domains = ['www.cnblogs.com']  # allowed domains (only addresses under this domain are crawled)
    allowed_domains = ['127.0.0.1:5000']  # allowed domains (only addresses under this domain are crawled)
    start_urls = ['http://127.0.0.1:5000/test']  # start address

    # def parse(self, response):  # parse callback
    #     # response: the response object (like the requests module's response object)
    #     # print(response.text)
    #     # response.xpath()
    #     # response.css()
    #     ll = []  # for persistence
    #     article_list = response.xpath('//article[@class="post-item"]')
    #     for article in article_list:
    #         article_item = items.CnblogsSpiderItem()
    #         title = article.xpath('.//a[@class="post-item-title"]/text()').extract_first()
    #         desc = article.css('p.post-item-summary::text').extract()[-1]
    #         # author = article.css('a.post-item-author>span::text').extract_first()
    #         # author = article.css('footer.post-item-foot span::text').extract_first()
    #         # >  css
    #         # // xpath
    #         author = article.xpath('.//a[@class="post-item-author"]/span/text()').extract_first()
    #         # url = article.xpath('.//a[@class="post-item-title"]/@href').extract_first()
    #         url = article.css('a.post-item-title::attr(href)').extract_first()
    #         # first way of persisting -- forget it once you have heard it
    #         ll.append({'title': title, 'desc': desc, 'url': url})
    #         # print(url)
    #         # callback is the parse method for the detail page
    #         yield Request(url, callback=self.parser_detail)
    #
    #     # parse the next page
    #     # next = response.xpath('//div[@class="pager"]/a[last()]/@href').extract_first()
    #     next = response.css('div.pager a:last-child::attr(href)').extract_first()
    #     # print(next)
    #     # yield Request(next)
    #     return ll

    def parse(self, response):  # parse callback
        print(type(response))
        article_list = response.xpath('//article[@class="post-item"]')
        for article in article_list:
            article_item = items.CnblogsSpiderItem()
            # alternative selectors for the author:
            # author = article.css('a.post-item-author>span::text').extract_first()
            # author = article.css('footer.post-item-foot span::text').extract_first()
            title = article.xpath('.//a[@class="post-item-title"]/text()').extract_first()
            desc = article.css('p.post-item-summary::text').extract()[-1]
            author = article.xpath('.//a[@class="post-item-author"]/span/text()').extract_first()
            url = article.css('a.post-item-title::attr(href)').extract_first()
            # set attributes on the item (the dot syntax does not work, only [])
            # article_item.url = url
            # article_item.title = title
            # article_item.desc = desc
            # article_item.author = author
            article_item['url'] = url
            article_item['title'] = title
            article_item['desc'] = desc
            article_item['author'] = author
            # print(title)
            # yield article_item
            yield Request(url=url, callback=self.parser_detail, meta={'item': article_item})
            # yield Request(url, callback=self.parser_detail)

        next = response.css('div.pager a:last-child::attr(href)').extract_first()
        print(next)
        yield Request(next)

    def parser_detail(self, response):
        item = response.meta.get('item')
        # grab the tag and store it as a string, tag included (the tag carries the styling; plain text would lose it)
        content = response.css('#cnblogs_post_body').extract_first()
        # put the article body into the item
        item['content'] = str(content)
        yield item
cnblogs_spider\spiders\chouti.py  # verifies the custom dedup rule
import scrapy
from scrapy.dupefilters import RFPDupeFilter  # the dedup class used by default
# DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter' is hard-coded in the built-in settings; to swap it out, write your own class
from scrapy import Request
from selenium import webdriver


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['www.bilibili.com']
    # start address
    start_urls = ['https://www.bilibili.com/v/dance/otaku/#/all/default/0/1/']
    bro = webdriver.Chrome(executable_path='./chromedriver')
    bro.implicitly_wait(10)

    @staticmethod
    def close(spider, reason):
        spider.bro.close()

    # the real start of the crawl (see the source); you can skip start_urls and override start_requests directly
    # def start_requests(self):
    #     yield Request('http://www.baidu.com')

    def parse(self, response):
        # print(response.text)
        li_list = response.css('ul.vd-list li')
        print(len(li_list))
        for li in li_list:
            url = li.css('div.r>a::attr(href)').extract_first()
            print(url)
            # yield Request(url='https:' + url, callback=self.parser_detail)
            yield Request(url='https://www.bilibili.com/v/dance/otaku/#/all/default/0/1/')

    def parser_detail(self, response):
        print(len(response.text))