5.代碼書寫請求-全棧數據爬取
例子4:爬取全部頁面choutiAll--手動請求發送形式start_urls = ['https://dig.chouti.com/r/pic/hot/1']
解析抽屜圖片下全部的超鏈!
#設計了一個全部頁碼通用的url(pageNum表示的就是不一樣頁碼)
url = 'https://dig.chouti.com/r/pic/hot/%d'
重點是parse方法的調用yield scrapy.Request(url=url,callback=self.parse)
# -*- coding: utf-8 -*-
import scrapy
from choutiAllPro.items import ChoutiallproItem


class ChoutiSpider(scrapy.Spider):
    """Collect the link titles from every page of the chouti picture board.

    Only the first page is listed in ``start_urls``; every later page is
    requested manually from ``parse``, with ``parse`` as its own callback.
    """

    name = 'chouti'
    # allowed_domains = ['www.ddd.com']
    start_urls = ['https://dig.chouti.com/r/pic/hot/1']

    # URL template shared by all pages; %d is replaced by the page number.
    url = 'https://dig.chouti.com/r/pic/hot/%d'
    # Number of the page requested most recently.
    pageNum = 1
    # Assumed total number of pages on the board.
    maxPage = 120

    def parse(self, response):
        """Yield one item per entry on the page, then request the next page.

        Args:
            response: the downloaded board page.

        Yields:
            A ChoutiallproItem with ``title`` set for every entry, followed by
            at most one scrapy.Request for the next page.
        """
        for div in response.xpath('//div[@class="content-list"]/div'):
            item = ChoutiallproItem()
            item['title'] = div.xpath('./div[3]/div[1]/a/text()').extract_first()
            yield item

        # Strictly less-than: the counter is incremented before the request is
        # built, so `<=` would also request page maxPage + 1.
        if self.pageNum < self.maxPage:
            self.pageNum += 1
            next_url = self.url % self.pageNum
            # Manually send the request. The callback is parse itself, so parse
            # runs again for each following page until maxPage is reached.
            yield scrapy.Request(url=next_url, callback=self.parse)
//text獲取多個文本內容 /text獲取單個文本內容
scrapy框架會自動處理get請求的cookie
例子5:百度翻譯--發post請求--處理cookie--postPro
修改父類方法:
def start_requests(self):
    """Send a POST request to every URL in ``start_urls``.

    Yields:
        One scrapy.FormRequest per start URL, with ``parse`` as callback.
    """
    for url in self.start_urls:
        # FormRequest is what makes this a POST; formdata holds the form fields.
        yield scrapy.FormRequest(url=url, callback=self.parse, formdata={'kw': 'dog'})
# -*- coding: utf-8 -*-
import scrapy


# Goal: send a POST request to each url listed in start_urls.
class PostSpider(scrapy.Spider):
    """Post the form field ``kw=dog`` to the Baidu translate endpoint."""

    name = 'post'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://fanyi.baidu.com/sug']

    def start_requests(self):
        """Override the parent-class hook that sends the start_urls requests.

        Yields:
            One scrapy.FormRequest (POST) per start URL.
        """
        # formdata carries the parameters sent with the request.
        form_fields = {'kw': 'dog'}
        for target in self.start_urls:
            # scrapy.Request(url=target, callback=self.parse) would send a GET.
            yield scrapy.FormRequest(
                url=target,
                callback=self.parse,
                formdata=form_fields,
            )

    def parse(self, response):
        """Print the response body (a JSON string)."""
        body_text = response.text
        print(body_text)
例子6:登陸操做(登陸豆瓣電影),發post請求---loginPro
登陸便可獲取cookie
# -*- coding: utf-8 -*-
import scrapy


class LoginSpider(scrapy.Spider):
    """Log in to douban with a POST request, then save a profile page.

    The login response is handled by ``parse``, which requests a personal
    home page; ``getPageText`` writes that page to ``./douban.html``.
    """

    name = 'login'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://accounts.douban.com/login']

    # NOTE(security): real account credentials should not live in source.
    # They are class attributes so they can be replaced without editing the
    # file, e.g. `scrapy crawl login -a form_email=... -a form_password=...`.
    form_email = '15027900535'
    form_password = 'bobo@15027900535'

    def start_requests(self):
        """Post the login form to every URL in ``start_urls``.

        Yields:
            One scrapy.FormRequest per start URL, with ``parse`` as callback.
        """
        data = {
            'source': 'movie',
            'redir': 'https://movie.douban.com/',
            'form_email': self.form_email,
            'form_password': self.form_password,
            'login': '登陸',
        }
        for url in self.start_urls:
            yield scrapy.FormRequest(url=url, callback=self.parse, formdata=data)

    def getPageText(self, response):
        """Write the response text to ``./douban.html`` and print 'over'."""
        page_text = response.text
        with open('./douban.html', 'w', encoding='utf-8') as fp:
            fp.write(page_text)
            print('over')

    def parse(self, response):
        """Request the user's personal home page after the login response.

        If the saved page shows user information, the login cookie was sent
        with this request; otherwise the page is the login screen.
        """
        url = 'https://www.douban.com/people/185687620/'
        yield scrapy.Request(url=url, callback=self.getPageText)
6.scrapy核心組件--5大核心組件
總結流程描述:
引擎調用爬蟲文件中的start_requests方法,將列表中url封裝成請求對象(start_urls、yield中的),會有一系列的請求對象,引擎將請求對象給調度器,調度器會進行去重,
請求對象放在調度器的隊列中,調度器將請求對象調度給下載器,下載器拿着請求對象到互聯網中下載,頁面數據下載完後給下載器,下載器給爬蟲文件,
爬蟲文件進行解析(調用parse方法),將解析後的數據封裝到item對象中,提交給管道,管道進行持久化存儲。
注意:調度器中隊列,調度器對請求對象有去重功能。
1.引擎:全部方法的調用
2.調度器:接收引擎發送的請求,壓入到隊列中,去除重複網址
3.下載器:下載頁面內容,將下載好的頁面內容返回給蜘蛛(spider,就是爬蟲文件)
4.爬蟲文件(spiders):幹活的,將獲取的頁面數據進行解析操做
5.管道:進行持久化存儲
互聯網
下載中間件(介於引擎和下載器之間):可進行代理ip的更換
例子7:代理中間件的應用----dailiPro
daili.py的書寫;middlewares.py中DailiproDownloaderMiddleware下process_request方法
def process_request(self, request, spider):
    """Route the intercepted request through a fixed proxy.

    Args:
        request: the intercepted request object; its ``meta`` is modified.
        spider: unused here.

    Returns:
        None.
    """
    request.meta['proxy'] = "https://151.106.15.3:1080"
    return None
在settings中DOWNLOADER_MIDDLEWARES開啓 55-57行
# -*- coding: utf-8 -*-
import scrapy


class DailiSpider(scrapy.Spider):
    """Fetch a Baidu search for "ip" and save the returned page to disk."""

    name = 'daili'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.baidu.com/s?wd=ip']

    def parse(self, response):
        """Write the response text to ``daili.html`` (UTF-8)."""
        with open('daili.html', 'w', encoding='utf-8') as out_file:
            out_file.write(response.text)
# -*- coding: utf-8 -*-
from scrapy import signals


class DailiproDownloaderMiddleware(object):
    """Downloader middleware that sends every request through a fixed proxy."""

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and hook ``spider_opened`` to its signal."""
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_opened,
            signal=signals.spider_opened,
        )
        return middleware

    def process_request(self, request, spider):
        """Set the proxy on the intercepted request and return None.

        Only the 'proxy' key of ``request.meta`` is set; replacing the whole
        dict (e.g. ``request.meta = {"https": "151.106.15.3:1080"}``) is not
        recommended.
        """
        proxy_url = "https://151.106.15.3:1080"
        request.meta['proxy'] = proxy_url
        return None

    def process_response(self, request, response, spider):
        """Return the response unchanged."""
        return response

    def process_exception(self, request, exception, spider):
        """Do nothing; implicitly returns None."""
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that was opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
# -*- coding: utf-8 -*-

# Scrapy settings for dailiPro project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'dailiPro'

SPIDER_MODULES = ['dailiPro.spiders']
NEWSPIDER_MODULE = 'dailiPro.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'dailiPro (+http://www.yourdomain.com)'
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'dailiPro.middlewares.DailiproSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {  # lines 55-57: turns on the proxy middleware
    'dailiPro.middlewares.DailiproDownloaderMiddleware': 543,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'dailiPro.pipelines.DailiproPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Log levels: DEBUG INFO ERROR WARNING
#LOG_LEVEL = 'ERROR'

LOG_FILE = 'log.txt'  # send the log output to this file
7.日誌信息的設置
日誌等級 #DEBUG INFO ERROR WARNING
在settings中寫 #LOG_LEVEL = 'ERROR' 只輸出error類型的日誌
LOG_FILE = 'log.txt'日誌輸出到文件,上看6.上面settings.py中配置
8.請求傳參 :爬取的數據不在同一個頁面中
正則未生效!???
例子8:請求傳參---爬取電影詳情的數據---moviePro
將不一樣頁面的值放到同一個item裏(名稱和做者)
手動發請求--yield
請求傳參:經過Request方法的meta參數將某一個具體的數據值傳遞給request方法中指定的callback方法,callback中方法經過response去取,
item = response.meta['item'] 一個取name,二級子頁面中取author
yield scrapy.Request(url=url,callback=self.getSencodPageText,meta={'item':item}
def getSencodPageText(self, response):
    """Callback for the second-level page (excerpt showing the meta hand-off)."""
    # 2. Receive the item object that was passed in through Request's meta.
    item = response.meta['item']
# -*- coding: utf-8 -*-
import scrapy
from moviePro.items import MovieproItem


class MovieSpider(scrapy.Spider):
    """Collect each movie's name (list page) and actor (detail page).

    The two values live on different pages, so the half-filled item is
    handed from ``parse`` to ``getSencodPageText`` through ``Request.meta``.
    """

    name = 'movie'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.dy2018.com/html/gndy/dyzz/']

    def getSencodPageText(self, response):
        """Parse the movie detail page and complete the item.

        Args:
            response: detail-page response; ``response.meta['item']`` is the
                item created in ``parse``.

        Yields:
            The same item with ``actor`` filled in.
        """
        # 2. Receive the item object that was passed in through Request's meta.
        item = response.meta['item']
        actor = response.xpath('//*[@id="Zoom"]/p[16]/text()').extract_first()
        item['actor'] = actor
        yield item

    def parse(self, response):
        """Parse the list page and request every movie's detail page.

        Yields:
            One scrapy.Request per table row that has a detail link; the
            request carries the row's item in ``meta``.
        """
        print(response.text)
        # NOTE(review): <tbody> is often added by browsers and may be missing
        # from the raw HTML - confirm these XPaths against response.text.
        table_list = response.xpath('//div[@class="co_content8"]/ul/table')
        for table in table_list:
            href = table.xpath('./tbody/tr[2]/td[2]/b/a/@href').extract_first()
            if href is None:
                # No link in this row; concatenating None would raise TypeError.
                continue
            # The href has no scheme/host, so the https prefix is added here.
            url = "https://www.dy2018.com" + href
            name = table.xpath('./tbody/tr[2]/td[2]/b/a/text()').extract_first()
            print(url)

            item = MovieproItem()  # instantiate the item object
            item['name'] = name
            # 1. Pass the item to getSencodPageText by putting it in meta,
            #    and send the request manually.
            yield scrapy.Request(url=url, callback=self.getSencodPageText, meta={'item': item})
9.CrawlSpider的使用--連接提取器&規則解析器
CrawlSpider能夠進行全棧數據的爬取! --重點!
例子9:CrawlSpider的使用--爬取糗百圖片全棧數據--crawlPro
注意:項目建立 scrapy genspider -t crawl qiubai www.xxx.com
取第一頁的標籤?--注意allow取得是符合正則的連接 link1 = LinkExtractor(allow=r'/pic/$')
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class QiubaiSpider(CrawlSpider):
    """Crawl all pages of the qiushibaike picture section via link rules."""

    name = 'qiubai'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.qiushibaike.com/pic/']

    # Link extractors (for the page-number links): pull the wanted links out
    # of the page source of the start url.
    # allow: a regular expression; every link in the page source that matches
    # it is extracted.
    # Matches e.g. href="/pic/page/5?s=5144132"
    link = LinkExtractor(allow=r'/pic/page/\d+\?s=\d+')
    # Matches e.g. href="/pic/" (the regex is applied to the link's full text).
    link1 = LinkExtractor(allow=r'/pic/$')

    # Rule parsers: the page behind each extracted link is handed to the
    # named callback for parsing.
    # follow=True: keep applying the link extractor to the pages reached
    # through the extracted links; with False it is applied only to the
    # start_urls pages, which gives just a few results.
    rules = (
        Rule(link, follow=True, callback='parse_item'),
        Rule(link1, follow=True, callback='parse_item'),
    )

    def parse_item(self, response):
        """Print the response object of every matched page."""
        print(response)
10.分佈式爬取--多臺機器同時爬取同一頁面數據--重點!
在pycharm中下載redis
例子10:分佈式爬取--爬取抽屜42區--redisPro
#爬取抽屜42區全部圖片所對應的url鏈接
提交到redis中的管道
settings.py中ITEM_PIPELINES = {
'scrapy_redis.pipelines.RedisPipeline': 400
}
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_redis.spiders import RedisCrawlSpider
from redisPro.items import RedisproItem

# Steps for turning a CrawlSpider into a distributed scrapy-redis spider:
# 0. Import the RedisCrawlSpider class.
# 1. Change the spider's parent class to RedisCrawlSpider.
# 2. Replace start_urls with the redis_key attribute.
# 3. Write the actual parsing code.
# 4. Submit items to the pipeline shipped with scrapy-redis
#    (settings.py: ITEM_PIPELINES = {
#        'scrapy_redis.pipelines.RedisPipeline': 400
#    }).
# 5. Submit every request produced by the spider to the scheduler shipped
#    with scrapy-redis (settings.py lines 95-100).
# 6. Tell the settings which redis database stores the crawled data
#    (settings.py lines 105-108).
# 7. Edit the redis config file (redis.windows.conf):
#    protected-mode no, and comment out `bind 127.0.0.1`.
# 8. Run the spider file: scrapy runspider xxx.py
# 9. Push one starting url into the scheduler queue.


class ChoutiSpider(RedisCrawlSpider):
    """Distributed spider collecting image URLs from chouti news pages."""

    name = 'chouti'
    # allowed_domains = ['www.xxx.com']
    # start_urls = ['http://www.xxx.com/']

    # Name of the scheduler queue; the starting url is pushed into the queue
    # with this name.
    redis_key = "chouti"

    rules = (
        Rule(
            LinkExtractor(allow=r'/r/news/hot/\d+'),
            follow=True,
            callback='parse_item',
        ),
    )

    def parse_item(self, response):
        """Yield one RedisproItem per image ``src`` found on the page."""
        sources = response.xpath('//div[@class="news-pic"]/img/@src').extract()
        for src in sources:
            record = RedisproItem()
            record['url'] = src
            yield record
# -*- coding: utf-8 -*-

# Scrapy settings for redisPro project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'redisPro'

SPIDER_MODULES = ['redisPro.spiders']
NEWSPIDER_MODULE = 'redisPro.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'redisPro (+http://www.yourdomain.com)'
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'redisPro.middlewares.RedisproSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'redisPro.middlewares.RedisproDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400,

#    'redisPro.pipelines.RedisproPipeline': 300,

}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Use the duplicate filter provided by the scrapy-redis component
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the scheduler provided by the scrapy-redis component
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Whether the crawl may be paused (and resumed later)
SCHEDULER_PERSIST = True




REDIS_HOST = '192.168.12.65'
REDIS_PORT = 6379
#REDIS_ENCODING = 'utf-8'
#REDIS_PARAMS = {'password': '123456'}
redis配置文件中註釋掉第56行(bind 127.0.0.1),並將第75行的保護模式(protected-mode)改成no
運行:
1.啓動redis服務器:進入到redis目錄,在cmd中輸入redis-server ./redis.windows.conf
2.啓動redis 數據庫客戶端:redis-cli
3.執行爬蟲文件:cmd進入到F:\Python自動化21期\3.Django&項目\day26 爬蟲1104\課上代碼及筆記\scrapy項目\redisPro\redisPro\spiders目錄下,
scrapy runspider chouti.py 會停在監聽的位置
4.在redis中:redis-cli
lpush chouti https://dig.chouti.com/r/news/hot/1 執行以後項目cmd中會進行數據爬取操做
5.在redis中查看爬取的數據
keys * -------存在chouti:items
lrange chouti:items 0 -1
刪除數據:redis cli
flushall便可
小結18:40-50 總結的答案: 1. 2種爬蟲模塊:requests、urllib; 2. robots協議做用:防君子不防小人,經常使用的一種反爬手段; 3. 使用雲打碼或者人工識別--注:驗證碼也是門戶網站的一種反爬手段; 4. 3種解析方式:xpath、BeautifulSoup、正則; 5. selenium--執行js代碼/PhantomJs、谷歌無頭瀏覽器; 6. 重要!數據加密(下載密文),動態數據爬取(梨視頻)token--登陸時rkey對應的值; 7. 5個:爬蟲文件、引擎、調度器、下載器、管道; 8. Spider/CrawlSpider/RedisCrawlSpider; 9. 總結的10步---能夠本身嘗試--分佈式樣本保存; 10. 未講到想要的內容括起來