Using the NetEase (163.com) news crawl as an example, this article covers how to use a User-Agent pool, a proxy pool, and Selenium inside the Scrapy framework.
The target is text-based news data from four sections: Domestic, International, Military, and Aviation.
def parse(self, response):
    menu_list = response.xpath('//div[@class="ns_area list"]/ul/li')
    for menu in [3, 4, 6, 7]:
        url = menu_list[menu].xpath('./a/@href').extract_first()
        title = menu_list[menu].xpath('./a/text()').extract_first()

        # Instantiate an item object
        item = WangyiProItem()
        item['title'] = title

        # Request each section's url to fetch its page data (headline, thumbnail,
        # keywords, publish time, url) and pass the item object along
        yield scrapy.Request(url=url, callback=self.parse_second, meta={'item': item})
def parse_second(self, response):
    # Get all of the news entries on the section page
    all_data_list = response.xpath('//div[@class="ndi_main"]/div')
    # Get the item object passed along with the request
    item = response.meta['item']
    # Extract the required fields
    for data in all_data_list:
        head = data.xpath('.//div[@class="news_title"]/h3/a/text()').extract_first()
        url = data.xpath('./a/@href').extract_first()
        img_url = data.xpath('./a/img/@src').extract_first()
        tag_list = data.xpath('.//div[@class="news_tag"]//text()').extract()
        tags = []
        # Strip the whitespace and newlines from the extracted tags.
        # Note: surprisingly, normalize-space() did not work here; still investigating why.
        for t in tag_list:
            t = t.strip('\n \t ')
            tags.append(t)
        tag = ''.join(tags)

        item['head'] = head
        item['url'] = url
        item['img_url'] = img_url
        item['tag'] = tag

        # Parse the full article content of each extracted url in get_content
        yield scrapy.Request(url=url, callback=self.get_content, meta={'item': item})
When this is run, however, no data is printed at all. Analysis shows that the NetEase news section pages are loaded dynamically, while the response Scrapy fetches directly contains only the static HTML. We therefore need Selenium to handle the dynamically loaded pages.
1) Import the webdriver class in the spider file: from selenium import webdriver
2) Instantiate the browser in the spider class's constructor
3) Quit the browser in the spider class's closed method
def __init__(self):
    # PhantomJS is used here; Chrome ran into compatibility problems with Windows
    # during execution (still to be resolved)
    # Instantiate a single browser object when the spider starts
    self.bro = webdriver.PhantomJS(executable_path=r'E:\Study\Python\PycharmProjects\reptile\wangyi_pro\wangyi_pro\spiders\phantomjs-2.1.1-windows\bin\phantomjs.exe')

def closed(self, spider):
    # The browser must only be closed after the whole crawl has finished
    print("Crawl finished")
    self.bro.quit()
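Note that PhantomJS support has been removed from recent Selenium releases, so it may not be available in a newer environment. A minimal sketch of swapping in headless Chrome instead (an assumption, not the original setup; it requires a chromedriver matching the local Chrome version to be installed and on PATH):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def __init__(self):
    options = Options()
    options.add_argument('--headless')     # run without a visible window, similar to PhantomJS
    options.add_argument('--disable-gpu')  # commonly recommended when running headless on Windows
    # Assumes chromedriver is on PATH; otherwise pass executable_path=r'...\chromedriver.exe'
    self.bro = webdriver.Chrome(options=options)

The rest of the spider and the downloader middleware stay the same, since they only use spider.bro.get, execute_script, and page_source.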
4) Carry out the browser automation in the downloader middleware's process_response method
from scrapy.http import HtmlResponse   # used to build the replacement (tampered) page data
import time                            # used to give the page time to load

def process_response(self, request, response, spider):
    '''
    Intercept the response object (the one the downloader passes to the spider)
    and replace its page data.
    :param request: the request that produced this response
    :param response: the intercepted response object
    :param spider: the spider instance from the spider file
    '''
    if request.url in ['http://news.163.com/domestic/', 'http://news.163.com/world/',
                       'http://war.163.com/', 'http://news.163.com/air/']:
        spider.bro.get(url=request.url)
        # Scroll the browser page to the bottom automatically
        js = 'window.scrollTo(0,document.body.scrollHeight)'
        spider.bro.execute_script(js)
        time.sleep(2)  # always give the browser time to load the data
        # page_source now contains the dynamically loaded news data
        page_text = spider.bro.page_source
        return HtmlResponse(url=spider.bro.current_url, body=page_text, encoding='utf-8', request=request)
    else:
        return response
5) Enable the middleware in settings.py (this completes the dynamic page handling)
DOWNLOADER_MIDDLEWARES = {
    'wangyi_pro.middlewares.WangyiProDownloaderMiddleware': 543,
}
Next comes the UA pool. 1) Import: from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
2) Write a class based on UserAgentMiddleware and override its process_request method
# Personal observation: scrapy.contrib appears to have been removed, so this import
# fails and the approach below is no longer usable
'''
from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware
import random
# UA pool code (a separate downloader middleware class just for the UA pool)
class random_user_agent(UserAgentMiddleware):

    def process_request(self, request, spider):
        # Pick a random UA value from the list
        ua = random.choice(user_agent_list)
        # Write the chosen UA into the intercepted request
        request.headers.setdefault('User-Agent', ua)
'''
# This approach works
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
import time, random

user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
    "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]

class random_user_agent(UserAgentMiddleware):
    def process_request(self, request, spider):
        ua = random.choice(user_agent_list)
        request.headers['User-Agent'] = ua
        print(request.headers['User-Agent'])
3) Enable the middleware in settings.py
DOWNLOADER_MIDDLEWARES = {
    'wangyi_pro.middlewares.random_user_agent': 542,
}
The proxy pool works the same way: a downloader middleware that assigns a random proxy based on the request's scheme.

class random_proxy(object):
    def process_request(self, request, spider):
        # Check the scheme of the intercepted request (http or https)
        # request.url looks like: http://www.xxx.com
        h = request.url.split(':')[0]  # the request's scheme
        if h == 'https':
            ip = random.choice(PROXY_https)
            request.meta['proxy'] = 'https://' + ip
        else:
            ip = random.choice(PROXY_http)
            request.meta['proxy'] = 'http://' + ip
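The middleware reads from two module-level proxy lists defined in the same middlewares.py; these are the free proxies used in the complete code further down, and they will almost certainly need to be replaced with live ones:

PROXY_http = [
    '89.19.0.46:80',
    '120.210.219.101:8080',
]
PROXY_https = [
    '119.176.82.210:9999',
    '111.198.154.116:8888',
]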
Enable the middleware in settings.py:
DOWNLOADER_MIDDLEWARES = {
    'wangyi_pro.middlewares.random_proxy': 541,
}
Finally, the crawl is converted into a distributed crawl with scrapy-redis. Note: the scrapy-redis component already ships with a pipeline (RedisPipeline) dedicated to writing items into the Redis database, so we can use it directly and do not need to write our own pipeline.
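Enabling it is just an ITEM_PIPELINES entry in settings.py (it also appears in the complete settings below):

ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400,
}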
1) Code changes
a. Import: from scrapy_redis.spiders import RedisSpider
b. Change the spider class's base class to RedisSpider: class WangyiNewsSpider(RedisSpider):
c. Comment out start_urls and add a redis_key attribute (the name of the scheduler queue)
#start_urls = ['https://news.163.com/']
redis_key = 'wangyi'
The remaining configuration is the same as for the CrawlSpider-based distributed crawl; for the rest of the steps see: https://www.cnblogs.com/TigerAt/p/10595571.html
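Once everything is configured, each spider instance starts and then idles until a start URL is pushed onto the Redis list named by redis_key. A small sketch of seeding the queue with redis-py (assuming Redis is reachable at the host and port configured in settings.py):

import redis

# Connect to the same Redis instance the crawler uses (see REDIS_HOST / REDIS_PORT in settings.py)
r = redis.Redis(host='x.x.x.x', port=6379)
# Push the start URL onto the 'wangyi' list; idle spider instances will pick it up from there
r.lpush('wangyi', 'https://news.163.com/')

The same thing can be done from the redis-cli prompt with: lpush wangyi https://news.163.com/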
The complete code (spider file, middlewares.py, items.py, settings.py) follows:
# -*- coding: utf-8 -*-
import scrapy
# Import the webdriver class
from selenium import webdriver
from scrapy_redis.spiders import RedisSpider
from wangyi_pro.items import WangyiProItem


class WangyiNewsSpider(RedisSpider):
    name = 'wangyi_news'
    #allowed_domains = ['https://news.163.com/']
    #start_urls = ['https://news.163.com/']
    redis_key = 'wangyi'

    def __init__(self):
        # Instantiate a single browser object when the spider starts
        self.bro = webdriver.PhantomJS(executable_path=r'E:\Study\Python\PycharmProjects\reptile\wangyi_pro\wangyi_pro\spiders\phantomjs-2.1.1-windows\bin\phantomjs.exe')

    def closed(self, spider):
        # The browser must only be closed after the whole crawl has finished
        print("Crawl finished")
        self.bro.quit()

    def get_content(self, response):
        item = response.meta['item']
        content_list = response.xpath('//div[@class="post_text"]/p/text()').extract()
        content = "".join(content_list)
        item['content'] = content
        yield item

    def parse_second(self, response):
        # Get all of the news entries on the section page
        all_data_list = response.xpath('//div[@class="ndi_main"]/div')
        # Get the item object passed along with the request
        item = response.meta['item']
        # Extract the required fields
        for data in all_data_list:
            head = data.xpath('.//div[@class="news_title"]/h3/a/text()').extract_first()
            url = data.xpath('./a/@href').extract_first()
            img_url = data.xpath('./a/img/@src').extract_first()
            tag_list = data.xpath('.//div[@class="news_tag"]//text()').extract()
            tags = []
            # Strip the whitespace and newlines from the extracted tags.
            # Note: surprisingly, normalize-space() did not work here; still investigating why.
            for t in tag_list:
                t = t.strip('\n \t ')
                tags.append(t)
            tag = ''.join(tags)

            item['head'] = head
            item['url'] = url
            item['img_url'] = img_url
            item['tag'] = tag

            # Parse the full article content of each extracted url in get_content
            yield scrapy.Request(url=url, callback=self.get_content, meta={'item': item})

    def parse(self, response):
        menu_list = response.xpath('//div[@class="ns_area list"]/ul/li')
        for menu in [3, 4, 6, 7]:
            url = menu_list[menu].xpath('./a/@href').extract_first()
            title = menu_list[menu].xpath('./a/text()').extract_first()

            # Instantiate an item object
            item = WangyiProItem()
            item['title'] = title

            # Request each section's url to fetch its page data (headline, thumbnail,
            # keywords, publish time, url) and pass the item object along
            yield scrapy.Request(url=url, callback=self.parse_second, meta={'item': item})
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
from scrapy.http import HtmlResponse
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
import time, random

user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
    "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]

# UA pool
class random_user_agent(UserAgentMiddleware):
    def process_request(self, request, spider):
        ua = random.choice(user_agent_list)
        request.headers['User-Agent'] = ua


PROXY_http = [
    '89.19.0.46:80',
    '120.210.219.101:8080',
]
PROXY_https = [
    '119.176.82.210:9999',
    '111.198.154.116:8888',
]

class random_proxy(object):
    def process_request(self, request, spider):
        # Check the scheme of the intercepted request (http or https)
        # request.url looks like: http://www.xxx.com
        h = request.url.split(':')[0]  # the request's scheme
        if h == 'https':
            ip = random.choice(PROXY_https)
            request.meta['proxy'] = 'https://' + ip
        else:
            ip = random.choice(PROXY_http)
            request.meta['proxy'] = 'http://' + ip


class WangyiProDownloaderMiddleware(object):
    '''
    Intercept the response object (the one the downloader passes to the spider)
    and replace its page data.
    :param request: the request that produced this response
    :param response: the intercepted response object
    :param spider: the spider instance from the spider file
    '''
    def process_response(self, request, response, spider):
        if request.url in ['http://news.163.com/domestic/', 'http://news.163.com/world/',
                           'http://war.163.com/', 'http://news.163.com/air/']:
            spider.bro.get(url=request.url)
            # Scroll the browser page to the bottom automatically
            js = 'window.scrollTo(0,document.body.scrollHeight)'
            spider.bro.execute_script(js)
            time.sleep(2)  # always give the browser time to load the data
            # page_source now contains the dynamically loaded news data
            page_text = spider.bro.page_source
            return HtmlResponse(url=spider.bro.current_url, body=page_text, encoding='utf-8', request=request)
        else:
            return response
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class WangyiProItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    head = scrapy.Field()
    url = scrapy.Field()
    img_url = scrapy.Field()
    tag = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
# -*- coding: utf-8 -*-

# Scrapy settings for wangyi_pro project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'wangyi_pro'

SPIDER_MODULES = ['wangyi_pro.spiders']
NEWSPIDER_MODULE = 'wangyi_pro.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'wangyi_pro (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'wangyi_pro.middlewares.WangyiProSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'wangyi_pro.middlewares.WangyiProDownloaderMiddleware': 543,
    'wangyi_pro.middlewares.random_user_agent': 542,
    'wangyi_pro.middlewares.random_proxy': 541,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    #'wangyi_pro.pipelines.WangyiProPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'


# Use the request de-duplication filter provided by scrapy-redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the scheduler provided by the scrapy-redis component
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Allow pausing and resuming (keep the request queue in Redis when the crawl stops)
SCHEDULER_PERSIST = True

# Configure the following when the Redis database is not on the local machine
REDIS_HOST = 'x.x.x.x'
REDIS_PORT = 6379
REDIS_ENCODING = 'utf-8'
Note: this material is for learning and reference only; please do not use it for any other purpose.