Today's topics
1. High-performance crawling
2. A first look at Scrapy
Review of the previous session:
1. HTTP protocol
   HTTP:  GET / HTTP/1.1\r\n.....\r\n\r\na=1
   TCP:   sendall("GET / HTTP/1.1\r\n.....\r\n\r\na=1")
2. Request body
   GET:   GET / HTTP/1.1\r\n.....\r\n\r\n
   POST:  POST / HTTP/1.1\r\n.....\r\n\r\na=1&b=2
          POST / HTTP/1.1\r\n.....\r\n\r\n{"k1":123}
   PS: the body is parsed according to the Content-Type request header
3. requests module
   - method
   - url
   - params
   - data
   - json
   - headers
   - cookies
   - proxies
4. BeautifulSoup4 module
   HTML
   XML
5. Web WeChat
   - polling
   - long polling
Most sites use the Host, Referer, and Cookie headers for hotlink protection. If fetching an image URL fails unexpectedly, try adding a Host or Referer header, or the right cookies, to the request headers.
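A minimal sketch of that workaround with requests; the URL, header values, and cookie below are placeholders, not taken from the article:

import requests

# hypothetical image URL that rejects requests missing Referer/Cookie headers
url = 'http://example.com/images/pic.jpg'

headers = {
    'Host': 'example.com',
    'Referer': 'http://example.com/album/1',   # pretend we arrived from the album page
    'User-Agent': 'Mozilla/5.0',
}
cookies = {'sessionid': 'xxxx'}                # placeholder session cookie

response = requests.get(url, headers=headers, cookies=cookies)
if response.status_code == 200:
    with open('pic.jpg', 'wb') as f:
        f.write(response.content)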
When writing crawlers, most of the cost is IO waiting: with a single process and a single thread, each URL request blocks until its response arrives, so the requests as a whole are slow.
# 1. Serial, blocking requests: each URL waits for the previous one to finish.
import requests

def fetch_async(url):
    response = requests.get(url)
    return response

url_list = ['http://www.github.com', 'http://www.bing.com']

for url in url_list:
    fetch_async(url)
# 2. Thread pool: submit each URL to a ThreadPoolExecutor.
import requests
from concurrent.futures import ThreadPoolExecutor

def fetch_async(url):
    print('request started')
    response = requests.get(url)
    print(response.text)

url_list = ['http://www.baidu.com', 'http://www.bing.com']
pool = ThreadPoolExecutor(5)
for url in url_list:
    pool.submit(fetch_async, url)
pool.shutdown(wait=True)
# 3. Thread pool + callback: the callback runs when the future completes.
import requests
from concurrent.futures import ThreadPoolExecutor

def fetch_async(url):
    print('request started')
    response = requests.get(url)
    return response

def call_back(res):
    print('running callback')
    print(res.result())

url_list = ['http://www.baidu.com', 'http://www.bing.com']
pool = ThreadPoolExecutor(5)
for url in url_list:
    v = pool.submit(fetch_async, url)
    v.add_done_callback(call_back)
pool.shutdown(wait=True)
# 4. Process pool.
from concurrent.futures import ProcessPoolExecutor
import requests

def fetch_async(url):
    response = requests.get(url)
    return response

url_list = ['http://www.github.com', 'http://www.bing.com']
pool = ProcessPoolExecutor(5)
for url in url_list:
    pool.submit(fetch_async, url)
pool.shutdown(wait=True)
# 5. Process pool + callback.
from concurrent.futures import ProcessPoolExecutor
import requests

def fetch_async(url):
    response = requests.get(url)
    return response

def callback(future):
    print(future.result())

url_list = ['http://www.github.com', 'http://www.bing.com']
pool = ProcessPoolExecutor(5)
for url in url_list:
    v = pool.submit(fetch_async, url)
    v.add_done_callback(callback)
pool.shutdown(wait=True)
All of the code above improves request throughput, but the drawback of multithreading and multiprocessing is that threads and processes sit idle while blocked on IO, which is wasteful. Asynchronous IO is therefore the preferred approach:
What asynchronous IO means here: asynchronous means callbacks; non-blocking means concurrency without waiting.
import asyncio

@asyncio.coroutine
def func1():
    print('before...func1....')
    yield from asyncio.sleep(5)
    print('end...func1...')

tasks = [func1(), func1()]

loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.gather(*tasks))
loop.close()

'''
before...func1....
before...func1....
end...func1...
end...func1...
'''
There are four places where communication between a socket server and a client blocks:

1. When the server starts, the accept loop blocks while waiting for connections.
2. In the server's communication loop, recv after send blocks.
3. When the client starts, connecting to the server blocks.
4. After the client sends a message, recv blocks.
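A minimal sketch (not from the original article) of how setblocking(False) removes the client-side waits: the calls return immediately and raise BlockingIOError instead of blocking, so the socket can be handed to select() while other tasks run. The address below is a placeholder.

import socket

client = socket.socket()
client.setblocking(False)                  # connect()/recv() no longer wait

try:
    client.connect(('127.0.0.1', 8001))    # placeholder address
except BlockingIOError:
    # The handshake is still in progress; instead of waiting here,
    # register the socket with select() and move on to the next task.
    pass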
import asyncio

@asyncio.coroutine
def fetch_async(host, url='/'):
    print(host, url)
    reader, writer = yield from asyncio.open_connection(host, 80)

    request_header_content = """GET %s HTTP/1.0\r\nHost: %s\r\n\r\n""" % (url, host,)
    request_header_content = bytes(request_header_content, encoding='utf-8')

    writer.write(request_header_content)
    yield from writer.drain()
    text = yield from reader.read()
    print(host, url, text)
    writer.close()

tasks = [
    fetch_async('www.cnblogs.com', '/wupeiqi/'),
    fetch_async('dig.chouti.com', '/pic/show?nid=4073644713430508&lid=10273091')
]

loop = asyncio.get_event_loop()
results = loop.run_until_complete(asyncio.gather(*tasks))
loop.close()
import aiohttp
import asyncio

@asyncio.coroutine
def fetch_async(url):
    print(url)
    response = yield from aiohttp.request('GET', url)
    # data = yield from response.read()
    # print(url, data)
    print(url, response)
    response.close()

tasks = [fetch_async('http://www.google.com/'), fetch_async('http://www.chouti.com/')]

event_loop = asyncio.get_event_loop()
results = event_loop.run_until_complete(asyncio.gather(*tasks))
event_loop.close()
import asyncio
import requests

@asyncio.coroutine
def fetch_async(func, *args):
    loop = asyncio.get_event_loop()
    future = loop.run_in_executor(None, func, *args)
    response = yield from future
    print(response.url, response.content)

tasks = [
    fetch_async(requests.get, 'http://www.cnblogs.com/wupeiqi/'),
    fetch_async(requests.get, 'http://dig.chouti.com/pic/show?nid=4073644713430508&lid=10273091')
]

loop = asyncio.get_event_loop()
results = loop.run_until_complete(asyncio.gather(*tasks))
loop.close()
import gevent
import requests
from gevent import monkey

monkey.patch_all()

def fetch_async(method, url, req_kwargs):
    print(method, url, req_kwargs)
    response = requests.request(method=method, url=url, **req_kwargs)
    print(response.url, response.content)

# ##### send the requests #####
gevent.joinall([
    gevent.spawn(fetch_async, method='get', url='https://www.python.org/', req_kwargs={}),
    gevent.spawn(fetch_async, method='get', url='https://www.yahoo.com/', req_kwargs={}),
    gevent.spawn(fetch_async, method='get', url='https://github.com/', req_kwargs={}),
])

# ##### send the requests (use a pool to cap the number of greenlets) #####
# from gevent.pool import Pool
# pool = Pool(None)
# gevent.joinall([
#     pool.spawn(fetch_async, method='get', url='https://www.python.org/', req_kwargs={}),
#     pool.spawn(fetch_async, method='get', url='https://www.yahoo.com/', req_kwargs={}),
#     pool.spawn(fetch_async, method='get', url='https://www.github.com/', req_kwargs={}),
# ])
import grequests

request_list = [
    grequests.get('http://httpbin.org/delay/1', timeout=0.001),
    grequests.get('http://fakedomain/'),
    grequests.get('http://httpbin.org/status/500')
]

# ##### run the requests and collect the responses #####
# response_list = grequests.map(request_list)
# print(response_list)

# ##### run the requests and collect the responses (with exception handling) #####
# def exception_handler(request, exception):
#     print(request, exception)
#     print("Request failed")
#
# response_list = grequests.map(request_list, exception_handler=exception_handler)
# print(response_list)
from twisted.web.client import getPage, defer
from twisted.internet import reactor

def all_done(arg):
    reactor.stop()   # stop the event loop once every request has finished

def callback(contents):
    print(contents)

deferred_list = []

url_list = ['http://www.bing.com', 'http://www.baidu.com', ]
for url in url_list:
    deferred = getPage(bytes(url, encoding='utf8'))
    deferred.addCallback(callback)
    deferred_list.append(deferred)

dlist = defer.DeferredList(deferred_list)
dlist.addBoth(all_done)

reactor.run()   # essentially an endless loop that keeps monitoring the sockets' state
from tornado.httpclient import AsyncHTTPClient
from tornado.httpclient import HTTPRequest
from tornado import ioloop

def handle_response(response):
    """
    Handle the response content. (You have to maintain a counter yourself and call
    ioloop.IOLoop.current().stop() once all requests are done, otherwise the IO loop never exits.)
    :param response:
    :return:
    """
    if response.error:
        print("Error:", response.error)
    else:
        print(response.body)

def func():
    url_list = [
        'http://www.baidu.com',
        'http://www.bing.com',
    ]
    for url in url_list:
        print(url)
        http_client = AsyncHTTPClient()
        http_client.fetch(HTTPRequest(url), handle_response)

ioloop.IOLoop.current().add_callback(func)
ioloop.IOLoop.current().start()
from twisted.internet import reactor
from twisted.web.client import getPage
import urllib.parse

def one_done(arg):
    print(arg)
    reactor.stop()

post_data = urllib.parse.urlencode({'check_data': 'adf'})
post_data = bytes(post_data, encoding='utf8')
headers = {b'Content-Type': b'application/x-www-form-urlencoded'}
response = getPage(bytes('http://dig.chouti.com/login', encoding='utf8'),
                   method=bytes('POST', encoding='utf8'),
                   postdata=post_data,
                   cookies={},
                   headers=headers)
response.addBoth(one_done)

reactor.run()
All of the above are asynchronous IO request modules, built into Python or provided by third parties; they are easy to use and greatly improve efficiency. The essence of an asynchronous IO request is [non-blocking sockets] + [IO multiplexing]:
import select
import socket
import time


class AsyncTimeoutException(TimeoutError):
    """Exception raised when a request times out."""

    def __init__(self, msg):
        self.msg = msg
        super(AsyncTimeoutException, self).__init__(msg)


class HttpContext(object):
    """Wraps the basic data of a request and its response."""

    def __init__(self, sock, host, port, method, url, data, callback, timeout=5):
        """
        sock: client socket object for the request
        host: host to request
        port: port to request
        method: HTTP method
        url: URL to request
        data: data in the request body
        callback: callback executed when the request completes
        timeout: request timeout in seconds
        """
        self.sock = sock
        self.callback = callback
        self.host = host
        self.port = port
        self.method = method
        self.url = url
        self.data = data
        self.timeout = timeout

        self.__start_time = time.time()
        self.__buffer = []

    def is_timeout(self):
        """Has the current request already timed out?"""
        current_time = time.time()
        if (self.__start_time + self.timeout) < current_time:
            return True

    def fileno(self):
        """File descriptor of the request socket, used by select."""
        return self.sock.fileno()

    def write(self, data):
        """Append a chunk of response data to the buffer."""
        self.__buffer.append(data)

    def finish(self, exc=None):
        """The response is complete (or failed); run the request's callback."""
        if not exc:
            response = b''.join(self.__buffer)
            self.callback(self, response, exc)
        else:
            self.callback(self, None, exc)

    def send_request_data(self):
        content = """%s %s HTTP/1.0\r\nHost: %s\r\n\r\n%s""" % (
            self.method.upper(), self.url, self.host, self.data,)
        return content.encode(encoding='utf8')


class AsyncRequest(object):
    def __init__(self):
        self.fds = []
        self.connections = []

    def add_request(self, host, port, method, url, data, callback, timeout):
        """Create a new request."""
        client = socket.socket()
        client.setblocking(False)
        try:
            client.connect((host, port))
        except BlockingIOError as e:
            pass
            # print('connection request has been sent to the remote host')
        req = HttpContext(client, host, port, method, url, data, callback, timeout)
        self.connections.append(req)
        self.fds.append(req)

    def check_conn_timeout(self):
        """Check all requests and terminate any that have already timed out."""
        timeout_list = []
        for context in self.connections:
            if context.is_timeout():
                timeout_list.append(context)
        for context in timeout_list:
            context.finish(AsyncTimeoutException('request timed out'))
            self.fds.remove(context)
            self.connections.remove(context)

    def running(self):
        """Event loop: detect which request sockets are ready and act on them."""
        while True:
            r, w, e = select.select(self.fds, self.connections, self.fds, 0.05)

            if not self.fds:
                return

            for context in r:
                sock = context.sock
                while True:
                    try:
                        data = sock.recv(8096)
                        if not data:
                            self.fds.remove(context)
                            context.finish()
                            break
                        else:
                            context.write(data)
                    except BlockingIOError as e:
                        break
                    except TimeoutError as e:
                        self.fds.remove(context)
                        self.connections.remove(context)
                        context.finish(e)
                        break

            for context in w:
                # the connection to the remote server has succeeded; send the request data
                if context in self.fds:
                    data = context.send_request_data()
                    context.sock.sendall(data)
                    self.connections.remove(context)

            self.check_conn_timeout()


if __name__ == '__main__':
    def callback_func(context, response, ex):
        """
        :param context: HttpContext object wrapping the request details
        :param response: response content
        :param ex: exception, if any (the exception object on failure, otherwise None)
        :return:
        """
        print(context, response, ex)

    obj = AsyncRequest()
    url_list = [
        {'host': 'www.google.com', 'port': 80, 'method': 'GET',
         'url': '/', 'data': '', 'timeout': 5, 'callback': callback_func},
        {'host': 'www.baidu.com', 'port': 80, 'method': 'GET',
         'url': '/', 'data': '', 'timeout': 5, 'callback': callback_func},
        {'host': 'www.bing.com', 'port': 80, 'method': 'GET',
         'url': '/', 'data': '', 'timeout': 5, 'callback': callback_func},
    ]
    for item in url_list:
        print(item)
        obj.add_request(**item)

    obj.running()
IO multiplexing: select is used to detect state changes on socket objects (whether the connection has succeeded, whether data has arrived).
import socket
import select


class Request(object):
    def __init__(self, sock, func, url):
        self.sock = sock
        self.func = func
        self.url = url

    def fileno(self):
        return self.sock.fileno()  # file descriptor of the socket object, needed by select


def async_request(url_list):
    input_list = []
    conn_list = []

    for url in url_list:
        client = socket.socket()
        client.setblocking(False)
        # initiate the connection without blocking
        try:
            client.connect((url[0], 80,))
        except BlockingIOError as e:
            pass

        obj = Request(client, url[1], url[0])

        input_list.append(obj)
        conn_list.append(obj)

    while True:
        # Watch whether the sockets have changed state: [request_obj, request_obj, ...]
        # Connections that have just succeeded appear in wlist = [request_obj, ...]
        # Sockets with response data appear in rlist = [request_obj, ...]
        rlist, wlist, elist = select.select(input_list, conn_list, [], 0.05)

        for request_obj in wlist:
            # connection established: send the HTTP request
            request_obj.sock.sendall(
                "GET / HTTP/1.0\r\nhost:{0}\r\n\r\n".format(request_obj.url).encode('utf-8'))
            conn_list.remove(request_obj)

        for request_obj in rlist:
            data = request_obj.sock.recv(8096)
            request_obj.func(data)
            request_obj.sock.close()
            input_list.remove(request_obj)

        if not input_list:
            break
# the select-based module above is assumed to be saved as s2.py
import s2

def callback1(data):
    print('baidu responded', data)

def callback2(data):
    print('bing responded', data)

url_list = [
    ['www.baidu.com', callback1],
    ['www.bing.com', callback2]
]

s2.async_request(url_list)
Classic Q&A:

Q: How can a single thread handle requests concurrently?
   When the first task arrives, send its connection request. This would normally mean an IO wait,
   but instead of waiting we immediately send the connection request for the second task, and so on.
   IO multiplexing then watches the sockets for changes:
       a connection succeeds -> send the request data: GET / HTTP/1.0\r\nHost: ...
       hit an IO wait        -> don't wait; keep checking whether another connection has succeeded
       (repeat for each connection)
       a response comes back -> read the response content and run its callback (once per response)

Q: What is a coroutine?
   Simply running one piece of code, jumping to another piece, then jumping again, and so on.

Asynchronous IO:
   - [coroutine-based]  coroutines + non-blocking sockets + select, e.g. gevent
   - [event-loop-based] plain sockets + select, e.g. Twisted

1. How do you increase crawler concurrency?
   Use an asynchronous IO module such as asyncio, Twisted, or gevent.
   Under the hood:
   - [coroutine-based]  coroutines + non-blocking sockets + select, e.g. gevent
   - [event-loop-based] plain sockets + select, e.g. Twisted, Tornado

2. Asynchronous, non-blocking
   asynchronous: callbacks (select)
   non-blocking: no waiting (setblocking(False))

3. What is a coroutine?
   A coroutine switches where the programmer (or library) says to switch; in practice it switches
   whenever an IO block is hit.

   pip3 install gevent

   from greenlet import greenlet

   def test1():
       print(12)
       gr2.switch()
       print(34)
       gr2.switch()

   def test2():
       print(56)
       gr1.switch()
       print(78)

   gr1 = greenlet(test1)
   gr2 = greenlet(test2)
   gr1.switch()
Scrapy is an application framework written for crawling web sites and extracting structured data. It can be used in a wide range of programs, such as data mining, information processing, and archiving historical data.

It was originally designed for page scraping (more precisely, web scraping), but it can also be used to fetch data returned by APIs (such as Amazon Associates Web Services) or as a general-purpose web crawler. Scrapy has broad uses, including data mining, monitoring, and automated testing.
Scrapy uses the Twisted asynchronous networking library to handle network communication. The overall architecture is roughly as follows:
Execution flow:

When Scrapy starts, the engine puts the initial requests into the scheduler (a queue). The downloader takes requests and downloads the pages, and the responses are handed back to the spiders (there can be more than one). A spider can persist the extracted data through items and pipelines, or yield new requests from its callbacks, which the engine puts back into the scheduler. A minimal spider illustrating this loop is sketched below.
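A minimal sketch of that loop (the spider name and XPath expressions are placeholders, not from the original article; the quotes.toscrape.com site is the one used by the shell command further down): yielding an item sends data to the pipelines, while yielding a Request puts a new task back into the scheduler.

import scrapy
from scrapy.http import Request

class DemoSpider(scrapy.Spider):
    name = 'demo'                                    # placeholder spider name
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        # yield data -> goes through items/pipelines for persistence
        for text in response.xpath('//span[@class="text"]/text()').extract():
            yield {'text': text}

        # yield a new Request -> goes back into the scheduler (recursive crawling)
        next_page = response.xpath('//li[@class="next"]/a/@href').extract_first()
        if next_page:
            yield Request(response.urljoin(next_page), callback=self.parse)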
Scrapy mainly consists of the following components:

- Scrapy Engine: drives the data flow between all the other components
- Scheduler: queues the requests handed to it by the engine
- Downloader: fetches pages and returns the responses to the engine
- Spiders: user-written classes that parse responses, extract items, and yield follow-up requests
- Item Pipeline: processes and persists the items extracted by the spiders
- Downloader middlewares / Spider middlewares: hooks between the engine and the downloader or the spiders
The Scrapy run flow is roughly:

1. The engine takes the start URLs from the spider and wraps them in Requests.
2. The Requests are queued in the scheduler.
3. The engine pulls Requests from the scheduler and hands them to the downloader.
4. The downloader fetches each page and returns a Response, which the engine passes to the spider's callback.
5. The spider parses the Response, yielding Items (sent to the item pipeline) and/or new Requests (sent back to the scheduler).
6. This repeats until the scheduler has no more Requests.
Linux:
    pip3 install scrapy

Windows:
    a. pip3 install wheel
    b. download a Twisted wheel from http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
    c. in the download directory, run: pip3 install Twisted-17.1.0-cp35-cp35m-win_amd64.whl
    d. pip3 install scrapy
    e. download and install pywin32: https://sourceforge.net/projects/pywin32/files/
    f. pip3 install pypiwin32   # use pip3 to install it if the pywin32 installer cannot find your Python directory
1. scrapy startproject <project_name>
   - create a project skeleton in the current directory (similar to Django)

2. scrapy genspider [-t template] <name> <domain>
   - create a spider, e.g.:
     scrapy genspider -t basic oldboy oldboy.com
     scrapy genspider -t xmlfeed autohome autohome.com.cn
   PS: list the available templates: scrapy genspider -l
       show a template:             scrapy genspider -d <template_name>

3. scrapy list
   - list the spiders in the project

4. scrapy crawl <spider_name>
   - run a single spider

5. run without the debug log:
   scrapy crawl quotes --nolog

6. interactive shell for debugging:
   scrapy shell quotes.toscrape.com

7. export the results to a JSON file:
   scrapy crawl quotes -o quotes.json

8. download a page and open it in the browser:
   scrapy view http://www.chuchujie.com

9. fetch a URL and run it through a spider callback, printing the results (the trailing "parse" is the method name):
   scrapy parse http://quotes.toscrape.com -c parse

10. run a spider file directly with runspider:
    scrapy runspider spiders/quotes.py
project_name/
    scrapy.cfg
    project_name/
        __init__.py
        items.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            spider1.py
            spider2.py
            spider3.py
File descriptions:
- scrapy.cfg: the project's deployment configuration file (the real settings live in settings.py)
- items.py: defines the data structures (fields) for scraped items
- pipelines.py: data processing and persistence
- settings.py: crawler configuration (recursion depth, concurrency, download delay, user agent, ...)
- spiders/: the directory holding the spider files
Note: a spider file is usually named after the domain of the site it crawls.
If you run into encoding problems on Windows:

import io, sys
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')
Example:

Crawl the news items from Chouti (dig.chouti.com):
# -*- coding: utf-8 -*-
import scrapy
import io, os, sys
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')
from scrapy.selector import HtmlXPathSelector
from ..items import Sp1Item
from scrapy.http import Request


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    # start_urls = ['http://dig.chouti.com/', ]

    def start_requests(self):
        yield Request(url="http://dig.chouti.com/", headers={}, callback=self.parse)

    def parse(self, response):
        # print(response.body)
        # print(response.text)

        hxs = HtmlXPathSelector(response)
        # result = hxs.select('//div[@id="yellow-msg-box-intohot"]')

        item_list = hxs.select('//div[@id="content-list"]/div[@class="item"]')
        for item in item_list:
            # item.select('./div[@class="news-content"]/div[@class="part2"]/text()').extract()
            # item.select('./div[@class="news-content"]/div[@class="part2"]/text()').extract_first()
            title = item.select('./div[@class="news-content"]/div[@class="part2"]/@share-title').extract_first()
            url = item.select('./div[@class="news-content"]/div[@class="part2"]/@share-pic').extract_first()
            obj = Sp1Item(title=title, url=url)
            yield obj

        # find all pagination links
        # hxs.select('//div[@id="dig_lcpage"]//a/@href').extract()
        page_url_list = hxs.select('//div[@id="dig_lcpage"]//a[re:test(@href,"/all/hot/recent/\d+")]/@href').extract()
        for url in page_url_list:
            url = "http://dig.chouti.com" + url
            obj = Request(url=url, callback=self.parse, headers={}, cookies={})
            yield obj
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class Sp1Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    url = scrapy.Field()
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class Sp1Pipeline(object):
    def __init__(self, file_path):
        self.file_path = file_path
        self.file_obj = None

    @classmethod
    def from_crawler(cls, crawler):
        """
        Called once at start-up to create the pipeline object.
        :param crawler:
        :return:
        """
        val = crawler.settings.get('XXXXXXX')
        return cls(val)

    def process_item(self, item, spider):
        if spider.name == 'chouti':
            self.file_obj.write(item['url'])
        # print('pipeline-->', item)
        return item

    def open_spider(self, spider):
        """
        Executed once when the spider starts.
        :param spider:
        :return:
        """
        self.file_obj = open(self.file_path, mode='a+')

    def close_spider(self, spider):
        """
        Executed once when the spider is closed.
        :param spider:
        :return:
        """
        self.file_obj.close()
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from scrapy.selector import Selector, HtmlXPathSelector
from scrapy.http import HtmlResponse

html = """<!DOCTYPE html>
<html>
    <head lang="en">
        <meta charset="UTF-8">
        <title></title>
    </head>
    <body>
        <ul>
            <li class="item-"><a id='i1' href="link.html">first item</a></li>
            <li class="item-0"><a id='i2' href="llink.html">first item</a></li>
            <li class="item-1"><a href="llink2.html">second item<span>vv</span></a></li>
        </ul>
        <div><a href="llink2.html">second item</a></div>
    </body>
</html>
"""
response = HtmlResponse(url='http://example.com', body=html, encoding='utf-8')

# hxs = HtmlXPathSelector(response)
# print(hxs)

# hxs = Selector(response=response).xpath('//a')                                # all <a> tags
# print(hxs)
# hxs = Selector(response=response).xpath('//a[2]')                             # the second <a> tag
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@id]')                           # <a> tags that have an id attribute
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@id="i1"]')                      # <a> tags with id="i1"
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@href="link.html"][@id="i1"]')   # href="link.html" and id="i1"
# print(hxs)
# hxs = Selector(response=response).xpath('//a[contains(@href, "link")]')       # href contains "link"
# print(hxs)
# hxs = Selector(response=response).xpath('//a[starts-with(@href, "link")]')    # href starts with "link"
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]')          # regex: id matches "i" followed by digits
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/text()').extract()   # text of the matched <a> tags
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/@href').extract()    # href attribute of the matched <a> tags
# print(hxs)
# hxs = Selector(response=response).xpath('/html/body/ul/li/a/@href').extract()
# print(hxs)
# hxs = Selector(response=response).xpath('//body/ul/li/a/@href').extract_first()
# print(hxs)

# ul_list = Selector(response=response).xpath('//body/ul/li')
# for item in ul_list:
#     v = item.xpath('./a/span')
#     # or
#     # v = item.xpath('a/span')
#     # or
#     # v = item.xpath('*/a/span')
#     print(v)
Note: set DEPTH_LIMIT = 1 in settings.py to limit how many levels deep the "recursive" crawl goes.
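A minimal settings.py fragment tying the pieces together; the package name sp1 is assumed from the Sp1Pipeline/Sp1Item example above, and the pipeline priority value is illustrative:

# settings.py (fragment)

# limit how many levels of follow-up Requests the crawl will go (1 = start pages plus one hop)
DEPTH_LIMIT = 1

# register the pipeline from pipelines.py; the number is its priority (lower runs first)
ITEM_PIPELINES = {
    'sp1.pipelines.Sp1Pipeline': 300,   # assumes the project package is named sp1
}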