Requests is an Apache2-licensed HTTP library written in Python. It is a high-level wrapper around Python's built-in urllib modules, which makes sending network requests far more pleasant: with Requests you can easily do just about anything a browser can do.
Features:
- Keep-Alive & connection pooling
- International domains and URLs
- Sessions with cookie persistence
- Browser-style SSL verification
- Automatic content decoding
- Basic/Digest authentication
- Elegant key/value cookies
- Automatic decompression
- Unicode response bodies
- HTTP(S) proxy support
- Multipart file uploads
- Streaming downloads
- Connection timeouts
- Chunked requests
- .netrc support
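As a quick taste of the API before looking at the source below — a minimal sketch; httpbin.org is used purely as a demo endpoint:

import requests

# A plain GET with query-string parameters; Requests handles URL encoding,
# connection pooling and response decoding for us.
resp = requests.get('https://httpbin.org/get', params={'q': 'python'}, timeout=5)

print(resp.status_code)                # 200
print(resp.headers['Content-Type'])    # application/json
print(resp.json())                     # parsed JSON body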
from . import sessions

# The core method
def request(method, url, **kwargs):
    """(a very long docstring, reproduced below)"""
    with sessions.Session() as session:
        return session.request(method=method, url=url, **kwargs)

# All of the methods below are implemented on top of request()
def get(url, params=None, **kwargs):
    pass

def options(url, **kwargs):
    pass

def head(url, **kwargs):
    pass

def post(url, data=None, json=None, **kwargs):
    pass

def put(url, data=None, **kwargs):
    pass

def patch(url, data=None, **kwargs):
    pass

def delete(url, **kwargs):
    pass
def request(method, url, **kwargs):
    """Constructs and sends a :class:`Request <Request>`.

    :param method: method for the new :class:`Request` object.
    :param url: URL for the new :class:`Request` object.
    :param params: (optional) Dictionary, list of tuples or bytes to send
        in the query string for the :class:`Request`.
    :param data: (optional) Dictionary, list of tuples, bytes, or file-like
        object to send in the body of the :class:`Request`.
    :param json: (optional) A JSON serializable Python object to send in the body of the :class:`Request`.
    :param headers: (optional) Dictionary of HTTP Headers to send with the :class:`Request`.
    :param cookies: (optional) Dict or CookieJar object to send with the :class:`Request`.
    :param files: (optional) Dictionary of ``'name': file-like-objects`` (or ``{'name': file-tuple}``) for multipart encoding upload.
        ``file-tuple`` can be a 2-tuple ``('filename', fileobj)``, 3-tuple ``('filename', fileobj, 'content_type')``
        or a 4-tuple ``('filename', fileobj, 'content_type', custom_headers)``, where ``'content-type'`` is a string
        defining the content type of the given file and ``custom_headers`` a dict-like object containing additional headers
        to add for the file.
    :param auth: (optional) Auth tuple to enable Basic/Digest/Custom HTTP Auth.
    :param timeout: (optional) How many seconds to wait for the server to send data
        before giving up, as a float, or a :ref:`(connect timeout, read
        timeout) <timeouts>` tuple.
    :type timeout: float or tuple
    :param allow_redirects: (optional) Boolean. Enable/disable GET/OPTIONS/POST/PUT/PATCH/DELETE/HEAD redirection. Defaults to ``True``.
    :type allow_redirects: bool
    :param proxies: (optional) Dictionary mapping protocol to the URL of the proxy.
    :param verify: (optional) Either a boolean, in which case it controls whether we verify
        the server's TLS certificate, or a string, in which case it must be a path
        to a CA bundle to use. Defaults to ``True``.
    :param stream: (optional) if ``False``, the response content will be immediately downloaded.
    :param cert: (optional) if String, path to ssl client cert file (.pem). If Tuple, ('cert', 'key') pair.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response

    Usage::

      >>> import requests
      >>> req = requests.request('GET', 'https://httpbin.org/get')
      >>> req
      <Response [200]>
    """

    # By using the 'with' statement we are sure the session is closed, thus we
    # avoid leaving sockets open which can trigger a ResourceWarning in some
    # cases, and look like a memory leak in others.
    with sessions.Session() as session:
        return session.request(method=method, url=url, **kwargs)
import requests


def param_method_url():
    # requests.request(method='get', url='http://127.0.0.1:8000/test/')
    # requests.request(method='post', url='http://127.0.0.1:8000/test/')
    pass


def param_param():
    # params can be:
    # - a dict
    # - a string
    # - bytes (ASCII only)

    # requests.request(method='get',
    #                  url='http://127.0.0.1:8000/test/',
    #                  params={'k1': 'v1', 'k2': '水電費'})

    # requests.request(method='get',
    #                  url='http://127.0.0.1:8000/test/',
    #                  params="k1=v1&k2=水電費&k3=v3&k3=vv3")

    # requests.request(method='get',
    #                  url='http://127.0.0.1:8000/test/',
    #                  params=bytes("k1=v1&k2=k2&k3=v3&k3=vv3", encoding='utf8'))

    # Error: non-ASCII bytes are not accepted
    # requests.request(method='get',
    #                  url='http://127.0.0.1:8000/test/',
    #                  params=bytes("k1=v1&k2=水電費&k3=v3&k3=vv3", encoding='utf8'))
    pass


def param_data():
    # data can be:
    # - a dict
    # - a string
    # - bytes
    # - a file object

    # requests.request(method='POST',
    #                  url='http://127.0.0.1:8000/test/',
    #                  data={'k1': 'v1', 'k2': '水電費'})

    # requests.request(method='POST',
    #                  url='http://127.0.0.1:8000/test/',
    #                  data="k1=v1; k2=v2; k3=v3; k3=v4"
    #                  )

    # requests.request(method='POST',
    #                  url='http://127.0.0.1:8000/test/',
    #                  data="k1=v1;k2=v2;k3=v3;k3=v4",
    #                  headers={'Content-Type': 'application/x-www-form-urlencoded'}
    #                  )

    # requests.request(method='POST',
    #                  url='http://127.0.0.1:8000/test/',
    #                  data=open('data_file.py', mode='r', encoding='utf-8'),  # file content: k1=v1;k2=v2;k3=v3;k3=v4
    #                  headers={'Content-Type': 'application/x-www-form-urlencoded'}
    #                  )
    pass


def param_json():
    # The json argument is serialized with json.dumps(...) into a string, sent as the
    # request body, and Content-Type is set to 'application/json'.
    requests.request(method='POST',
                     url='http://127.0.0.1:8000/test/',
                     json={'k1': 'v1', 'k2': '水電費'})


def param_headers():
    # Send custom request headers to the server
    requests.request(method='POST',
                     url='http://127.0.0.1:8000/test/',
                     json={'k1': 'v1', 'k2': '水電費'},
                     headers={'Content-Type': 'application/x-www-form-urlencoded'}
                     )


def param_cookies():
    # Send cookies to the server
    requests.request(method='POST',
                     url='http://127.0.0.1:8000/test/',
                     data={'k1': 'v1', 'k2': 'v2'},
                     cookies={'cook1': 'value1'},
                     )
    # A CookieJar can also be used (the dict form is just a wrapper around it)
    from http.cookiejar import CookieJar
    from http.cookiejar import Cookie

    obj = CookieJar()
    obj.set_cookie(Cookie(version=0, name='c1', value='v1', port=None, domain='', path='/', secure=False, expires=None,
                          discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False,
                          port_specified=False, domain_specified=False, domain_initial_dot=False, path_specified=False)
                   )
    requests.request(method='POST',
                     url='http://127.0.0.1:8000/test/',
                     data={'k1': 'v1', 'k2': 'v2'},
                     cookies=obj)


def param_files():
    # Upload a file
    # file_dict = {
    #     'f1': open('readme', 'rb')
    # }
    # requests.request(method='POST',
    #                  url='http://127.0.0.1:8000/test/',
    #                  files=file_dict)

    # Upload a file with a custom file name
    # file_dict = {
    #     'f1': ('test.txt', open('readme', 'rb'))
    # }
    # requests.request(method='POST',
    #                  url='http://127.0.0.1:8000/test/',
    #                  files=file_dict)

    # Upload in-memory content with a custom file name
    # file_dict = {
    #     'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf")
    # }
    # requests.request(method='POST',
    #                  url='http://127.0.0.1:8000/test/',
    #                  files=file_dict)

    # Upload with a custom file name, content type and extra headers
    # file_dict = {
    #     'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf", 'application/text', {'k1': '0'})
    # }
    # requests.request(method='POST',
    #                  url='http://127.0.0.1:8000/test/',
    #                  files=file_dict)

    pass


def param_auth():
    from requests.auth import HTTPBasicAuth, HTTPDigestAuth

    ret = requests.get('https://api.github.com/user', auth=HTTPBasicAuth('wupeiqi', 'sdfasdfasdf'))
    print(ret.text)

    # ret = requests.get('http://192.168.1.1',
    #                    auth=HTTPBasicAuth('admin', 'admin'))
    # ret.encoding = 'gbk'
    # print(ret.text)

    # ret = requests.get('http://httpbin.org/digest-auth/auth/user/pass', auth=HTTPDigestAuth('user', 'pass'))
    # print(ret)


def param_timeout():
    # ret = requests.get('http://google.com/', timeout=1)
    # print(ret)

    # ret = requests.get('http://google.com/', timeout=(5, 1))
    # print(ret)
    pass


def param_allow_redirects():
    ret = requests.get('http://127.0.0.1:8000/test/', allow_redirects=False)
    print(ret.text)


def param_proxies():
    # proxies = {
    #     "http": "61.172.249.96:80",
    #     "https": "http://61.185.219.126:3128",
    # }

    # proxies = {'http://10.20.1.128': 'http://10.10.1.10:5323'}

    # ret = requests.get("http://www.proxy360.cn/Proxy", proxies=proxies)
    # print(ret.headers)

    # from requests.auth import HTTPProxyAuth
    #
    # proxyDict = {
    #     'http': '77.75.105.165',
    #     'https': '77.75.105.165'
    # }
    # auth = HTTPProxyAuth('username', 'mypassword')
    #
    # r = requests.get("http://www.google.com", proxies=proxyDict, auth=auth)
    # print(r.text)

    pass


def param_stream():
    ret = requests.get('http://127.0.0.1:8000/test/', stream=True)
    print(ret.content)
    ret.close()

    # from contextlib import closing
    # with closing(requests.get('http://httpbin.org/get', stream=True)) as r:
    #     # process the response here
    #     for i in r.iter_content():
    #         print(i)


def requests_session():
    import requests

    session = requests.Session()

    # 1. Hit any page first to obtain a cookie
    i1 = session.get(url="http://dig.chouti.com/help/service")

    # 2. Log in, carrying the cookie from the previous request; the backend
    #    authorizes the 'gpsd' value stored in that cookie
    i2 = session.post(
        url="http://dig.chouti.com/login",
        data={
            'phone': "8615131255089",
            'password': "xxxxxx",
            'oneMonth': ""
        }
    )

    # 3. Vote, still carrying the session cookie
    i3 = session.post(
        url="http://dig.chouti.com/link/vote?linksId=8589623",
    )
    print(i3.text)
Usage notes:
# The json parameter of post() is serialized automatically, but when the result is turned into bytes a
# Latin-1 encoding is used, so Chinese characters become unreadable. You can serialize and encode the
# payload yourself and pass it through the data parameter (internally, the json parameter is also handed
# to data once it has been processed):
rep = requests.post(url_send_msged, data=bytes(json.dumps(data_dict, ensure_ascii=False), encoding='utf8'))
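A small sketch of the two approaches side by side — data_dict is a made-up payload and httpbin.org only echoes the request back, so both names are stand-ins for your real endpoint and data:

import json
import requests

data_dict = {'msg': '水電費'}                    # payload containing non-ASCII text
url_send_msged = 'https://httpbin.org/post'      # placeholder endpoint for the demo

# 1) json= : requests serializes with ensure_ascii=True, so the body carries \uXXXX escapes
r1 = requests.post(url_send_msged, json=data_dict)

# 2) serialize yourself with ensure_ascii=False and send UTF-8 bytes through data=
payload = bytes(json.dumps(data_dict, ensure_ascii=False), encoding='utf8')
r2 = requests.post(url_send_msged, data=payload,
                   headers={'Content-Type': 'application/json'})

print(r1.request.body)   # \uXXXX-escaped JSON produced by requests' own dumps
print(r2.request.body)   # raw UTF-8 bytes with the Chinese characters intact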
response.text
- type: str
- decoding: Requests makes an educated guess at the text encoding based on the HTTP headers
- to change the encoding: response.encoding = "gbk"

response.content
- type: bytes
- decoding: none applied
- to decode it yourself: response.content.decode("utf8")
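To make the difference concrete, a short sketch (httpbin.org is only a convenient endpoint that serves known UTF-8 content):

import requests

resp = requests.get('https://httpbin.org/encoding/utf8')

print(type(resp.content))       # <class 'bytes'> - raw body, no decoding applied
print(type(resp.text))          # <class 'str'>   - decoded using resp.encoding
print(resp.apparent_encoding)   # encoding guessed from the body itself

# If the guess from the headers is wrong, fix it before reading .text ...
resp.encoding = 'utf-8'
text_a = resp.text
# ... or decode the raw bytes yourself
text_b = resp.content.decode('utf8')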
# data:
requests.post(
    url='xx',
    data={'k1': 'v1', 'k2': 'v2'}
)
# On the wire: POST / http1.1\r\nContent-Type: application/x-www-form-urlencoded...\r\n\r\nk1=v1&k2=v2


requests.post(
    url='xx',
    data=json.dumps({'k1': 'v1', 'k2': 'v2'})
)
# On the wire: POST / http1.1\r\n....\r\n\r\n{"k1": "v1", "k2": "v2"}

requests.post(
    url='xx',
    data=b'asdfasdf'
)
# On the wire: POST / http1.1\r\n....\r\n\r\n'asdfasdf'

# json:
requests.post(
    url='xx',
    json={'k1': 'v1', 'k2': 'v2'}   # usually a dict (any JSON-serializable object works)
)
# On the wire: POST / http1.1\r\nContent-Type: application/json....\r\n\r\n{"k1": "v1", "k2": "v2"}

# The data and json parameters of a POST therefore produce different request payloads, as shown above.
# Option 1:
requests.post(
    url='xx',
    data={'k1': 'v1', 'k2': 'v2'}
)
# On the wire: POST / http1.1\r\nContent-Type: application/x-www-form-urlencoded...\r\n\r\nk1=v1&k2=v2
#
# On the server side (e.g. Django), request.POST is guaranteed to be populated:
#   - Content-Type: application/x-www-form-urlencoded
#   - body format: k1=v1&k2=v2

# Option 2:
requests.post(
    url='xx',
    json={'k1': 'v1', 'k2': 'v2'}
)
# On the wire: POST / http1.1\r\nContent-Type: application/json....\r\n\r\n{"k1": "v1", "k2": "v2"}
#
# On the server side, read request.body instead:
#   bytes -> decode to a string -> deserialize the string into a dict
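A runnable way to see this difference without running a server of your own — httpbin.org simply reports how it parsed the request (used here only as a demo endpoint):

import requests

# Form-encoded: the server parses the body into form fields
r1 = requests.post('https://httpbin.org/post', data={'k1': 'v1', 'k2': 'v2'})
print(r1.json()['headers']['Content-Type'])   # application/x-www-form-urlencoded
print(r1.json()['form'])                      # {'k1': 'v1', 'k2': 'v2'}

# JSON-encoded: the server sees a raw JSON body instead of form fields
r2 = requests.post('https://httpbin.org/post', json={'k1': 'v1', 'k2': 'v2'})
print(r2.json()['headers']['Content-Type'])   # application/json
print(r2.json()['json'])                      # {'k1': 'v1', 'k2': 'v2'}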
Beautiful Soup is a Python library for pulling data out of HTML and XML files. Working with your parser of choice, it gives you idiomatic ways of navigating, searching, and modifying the document. It routinely saves hours or even days of work.
pip3 install beautifulsoup4  # be careful to install this exact package, not the obsolete 'BeautifulSoup' one
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
asdf
    <div class="title">
        <b>The Dormouse's story總共</b>
        <h1>f</h1>
    </div>
<div class="story">Once upon a time there were three little sisters; and their names were
    <a class="sister0" id="link1">Els<span>f</span>ie</a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</div>
ad<br/>sf
<p class="story">...</p>
</body>
</html>
"""

soup = BeautifulSoup(html_doc, features="lxml")
# find the first a tag
tag1 = soup.find(name='a')
# find all a tags
tag2 = soup.find_all(name='a')
# find the tag with id=link2
tag3 = soup.select('#link2')
# Navigating the parse tree: select directly by tag name. This is fast, but if
# several tags share the same name only the first one is returned.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p id="my p" class="title"><b id="bbb" class="boldest">The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.
</p>

<p class="story">...</p>
"""

# 1. Basic usage
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')
# soup = BeautifulSoup(open('a.html'), 'lxml')

print(soup.p)  # only the first match is returned: <p class="title" id="my p"><b class="boldest" id="bbb">The Dormouse's story</b></p>

# 2. Tag name
print(soup.p.name, type(soup.p.name))  # p <class 'str'>

# 3. Tag attributes
print(soup.p.attrs)  # {'id': 'my p', 'class': ['title']}

# 4. Tag text
print(soup.p.string)   # the text if p has exactly one text child, otherwise None -> The Dormouse's story
print(soup.p.strings)  # a generator over all text inside p: <generator object _all_strings at 0x...>
print(soup.p.text)     # all text inside p: The Dormouse's story
for line in soup.stripped_strings:  # all text with whitespace stripped; all-whitespace strings are skipped and leading/trailing whitespace removed
    print(line)

'''
If a tag contains more than one child, .string cannot decide which child's text
is meant and returns None; with a single child it returns that child's text.
For a structure like the one below, soup.p.string is None, but soup.p.strings
still yields all the text:
<p id='list-1'>
    哈哈哈哈
    <a class='sss'>
        <span>
            <h1>aaaa</h1>
        </span>
    </a>
    <b>bbbbb</b>
</p>
'''

# 5. Nested selection
print(soup.head.title.string)  # The Dormouse's story
print(soup.body.a.string)      # Elsie

# 6. Children and descendants
print(soup.p.contents)  # all direct children of p: [<b class="boldest" id="bbb">The Dormouse's story</b>]
print(soup.p.children)  # an iterator over the direct children of p: <list_iterator object at 0x...>

for i, child in enumerate(soup.p.children):
    print(i, child)

print(soup.p.descendants)  # all descendants of p; every nested tag is yielded: <generator object descendants at 0x...>
for i, child in enumerate(soup.p.descendants):
    print(i, child)

# 7. Parents and ancestors
print(soup.a.parent)   # the parent of the a tag: <p class="story">Once upon a time ...</p>
print(soup.a.parents)  # all ancestors of the a tag (parent, grandparent, ...): <generator object parents at 0x...>

# 8. Siblings
print('=====>')
print(soup.a.next_sibling)      # next sibling: ,
print(soup.a.previous_sibling)  # previous sibling: Once upon a time there were three little sisters; and their names were

print(soup.a.next_siblings)      # following siblings, as a generator: <generator object next_siblings at 0x...>
print(soup.a.previous_siblings)  # preceding siblings, as a generator: <generator object previous_siblings at 0x...>
# Searching the tree: BeautifulSoup defines many search methods; the two most important
# are find() and find_all(). The other methods take almost identical arguments.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p id="my p" class="title"><b id="bbb" class="boldest">The Dormouse's story</b>
</p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'lxml')

# 1.1 A string, i.e. a tag name
print(soup.find_all('b'))  # [<b class="boldest" id="bbb">The Dormouse's story</b>]

# 1.2 A regular expression
import re
print(soup.find_all(re.compile('^b')))  # tags whose name starts with b: body and b -> [<body>...</body>, <b class=...</b>]

# 1.3 A list: Beautiful Soup returns everything that matches any element of the list;
#     this finds all <a> and <b> tags:
print(soup.find_all(['a', 'b']))

# 1.4 True: matches anything, so this returns every tag but no text nodes
print(soup.find_all(True))
for tag in soup.find_all(True):
    print(tag.name)

# 1.5 A function: if none of the other filters fit, define a function that takes one
#     argument (a tag) and returns True when the tag matches, False otherwise
def has_class_but_no_id(tag):
    return tag.has_attr('class') and not tag.has_attr('id')

print(soup.find_all(has_class_but_no_id))
# Searching the tree, continued: the keyword arguments of find_all()
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p id="my p" class="title"><b id="bbb" class="boldest">The Dormouse's story</b>
</p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

from bs4 import BeautifulSoup
import re

soup = BeautifulSoup(html_doc, 'lxml')

# 2. find_all(name, attrs, recursive, text, **kwargs): name matches tags by name; bare strings in the document are ignored.

# 2.1 name: the value can be any kind of filter - a string, a regular expression, a list, a function or True.
print(soup.find_all(name=re.compile('^t')))

# 2.2 keyword arguments: key=value, where value can be a filter: a string, a regular expression, a list or True.
print(soup.find_all(id=re.compile('my')))
print(soup.find_all(href=re.compile('lacie'), id=re.compile('\d')))  # note: use class_ for classes
print(soup.find_all(id=True))  # every tag that has an id attribute

# Some attributes cannot be used as keyword arguments, e.g. HTML5 data-* attributes:
data_soup = BeautifulSoup('<div data-foo="value">foo!</div>', 'lxml')
# data_soup.find_all(data-foo="value")  # SyntaxError: keyword can't be an expression
# but they can still be searched through the attrs dictionary of find_all():
print(data_soup.find_all(attrs={"data-foo": "value"}))  # [<div data-foo="value">foo!</div>]

# 2.3 Searching by CSS class: the keyword is class_; its value can be any of the five filter types
print(soup.find_all('a', class_='sister'))        # a tags whose class is sister
print(soup.find_all('a', class_='sister ssss'))   # a tags with classes sister and ssss; a wrong order also fails to match -> []
print(soup.find_all(class_=re.compile('^sis')))   # every tag whose class starts with sis

# 2.4 attrs
print(soup.find_all('p', attrs={'class': 'story'}))

# 2.5 text: can be a string, a list, True or a regular expression
print(soup.find_all(text='Elsie'))
print(soup.find_all('a', text='Elsie'))

# 2.6 limit: searching a large document can be slow; if you do not need every match,
#     limit caps the number of results (like SQL's LIMIT) and stops the search early
print(soup.find_all('a', limit=2))

# 2.7 recursive: by default find_all() searches all descendants of the tag;
#     pass recursive=False to search only the direct children
print(soup.html.find_all('a'))
print(soup.html.find_all('a', recursive=False))  # []

'''
Calling a tag like a function:
find_all() is by far the most used search method, so it has a shortcut - calling a
BeautifulSoup object or a Tag object as if it were a function is the same as calling
its find_all() method. These two lines are equivalent:
soup.find_all("a")
soup("a")
So are these two:
soup.title.find_all(text=True)   # ["The Dormouse's story"]
soup.title(text=True)            # ["The Dormouse's story"]
'''
# Searching the tree, continued: find()
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p id="my p" class="title"><b id="bbb" class="boldest">The Dormouse's story</b>
</p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

from bs4 import BeautifulSoup
import re
soup = BeautifulSoup(html_doc, 'lxml')

# 3. find(name, attrs, recursive, text, **kwargs)
# find_all() returns every matching tag, but sometimes you only want a single result.
# If the document contains just one <body>, searching with find_all() is overkill;
# rather than passing limit=1, simply use find(). These two lines are equivalent:

soup.find_all('title', limit=1)  # [<title>The Dormouse's story</title>]
soup.find('title')               # <title>The Dormouse's story</title>

# The differences: find_all() returns a list containing the single result, while find() returns the result directly;
# and when nothing matches, find_all() returns an empty list while find() returns None.
print(soup.find("nosuchtag"))  # None

# soup.head.title is a shortcut for repeated find() calls:
soup.head.title                   # <title>The Dormouse's story</title>
soup.find("head").find("title")   # <title>The Dormouse's story</title>
soup.a.text                       # Elsie
# The module also supports CSS selectors via the select() method; see the docs:
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html#id37
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title">
    <b>The Dormouse's story</b>
    Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1">
        <span>Elsie</span>
    </a>
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    <div class='panel-1'>
        <ul class='list' id='list-1'>
            <li class='element'>Foo</li>
            <li class='element'>Bar</li>
            <li class='element'>Jay</li>
        </ul>
        <ul class='list list-small' id='list-2'>
            <li class='element'><h1 class='yyyy'>Foo</h1></li>
            <li class='element xxx'>Bar</li>
            <li class='element'>Jay</li>
        </ul>
    </div>
    and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'lxml')

# 1. CSS selectors
print(soup.p.select('.sister'))
print(soup.select('.sister span'))

print(soup.select('#link1'))
print(soup.select('#link1 span'))

print(soup.select('#list-2 .element.xxx'))

print(soup.select('#list-2')[0].select('.element'))  # you can keep chaining select(), but a single selector is usually enough

# 2. Getting attributes
print(soup.select('#list-2 h1')[0].attrs)  # {'class': ['yyyy']}

# 3. Getting text
print(soup.select('#list-2 h1')[0].get_text())
https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html#id40
BeautifulSoup summary:
1. Use the lxml parser.
2. Three kinds of selectors were covered: tag-name selection, find/find_all, and CSS selectors.
   1. Tag-name selection has weak filtering power, but it is fast.
   2. Prefer find and find_all for matching a single result or multiple results.
   3. If you are already very comfortable with CSS selectors, use select.
3. Remember the two most common extraction calls: attrs for attributes and get_text() for text (see the short example below).
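A compact sketch tying the three points together — the HTML snippet is invented for illustration:

from bs4 import BeautifulSoup

html = '<div><a class="sister" href="/elsie" id="link1">Elsie</a>' \
       '<a class="sister" href="/lacie" id="link2">Lacie</a></div>'

soup = BeautifulSoup(html, 'lxml')            # 1. lxml parser

first = soup.a                                # 2a. tag-name selection: fast, first match only
both = soup.find_all('a', class_='sister')    # 2b. find_all for every match
by_css = soup.select('#link2')                # 2c. CSS selector

print(first.attrs['href'], first.get_text())          # 3. attrs + get_text()
print([(a['href'], a.get_text()) for a in both])
print(by_css[0].get_text())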
Selenium is a browser automation testing tool. It runs directly in the browser, behaving just like a real user, and works on all major operating systems and all mainstream browsers.
In essence it drives the browser and fully simulates browser actions such as navigating, typing, clicking and scrolling, so you get the page as it looks after rendering.
In scraping it is used mainly to solve a problem the other modules cannot: executing JavaScript directly.
Documentation: https://selenium-python.readthedocs.io/
import time
from selenium import webdriver

driver = webdriver.Chrome()  # opens a browser window
try:
    driver.get('https://www.baidu.com')
    # Implicit wait: every element lookup waits up to 5 seconds for the element to appear (commonly used).
    # There is also explicit wait: wait a number of seconds for one specific element (used less often).
    driver.implicitly_wait(5)
    # input = driver.find_element_by_id('kw')
    # input.send_keys('python')
    # input.clear()
    time.sleep(10)
    input = driver.find_element_by_id('kw')

    login_tag = driver.find_element_by_link_text('登陸')  # the "log in" link element
    login_tag.click()
    # find_element_by_partial_link_text does a fuzzy match on the link text
    # login_tag = driver.find_element_by_partial_link_text('錄')

    user_login = driver.find_element_by_id('TANGRAM__PSP_10__footerULoginBtn')
    user_login.click()

    user_input = driver.find_element_by_id('TANGRAM__PSP_10__userName')
    pwd_input = driver.find_element_by_name('password')
    user_input.send_keys('cpp')
    pwd_input.send_keys('123')

    submit_btn = driver.find_element_by_id('TANGRAM__PSP_10__submit')
    submit_btn.click()
    time.sleep(3)

except Exception as e:
    print(e)
finally:
    driver.close()
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys  # keyboard keys


def get_goods(driver):
    # find_elements_by_class_name returns a list; note the difference from find_element_by_class_name
    goods_list = driver.find_elements_by_class_name('gl-item')
    for good in goods_list:
        # select inside each item with CSS selectors
        price = good.find_element_by_css_selector('.p-price i').text
        comment = good.find_element_by_css_selector('.p-commit a').text
        name = good.find_element_by_css_selector('.p-name a').get_attribute('title')
        url = good.find_element_by_css_selector('.p-img a').get_attribute('href')
        img = good.find_element_by_css_selector('.p-img img').get_attribute('src')
        if not img:
            # lazily loaded images keep their src in the data-lazy-img attribute
            img = good.find_element_by_css_selector('.p-img img').get_attribute('data-lazy-img')
        if not img.startswith('https:'):
            img = 'https:' + img
        print(img)
        print('''
        price:     %s
        name:      %s
        comments:  %s
        image url: %s
        detail url:%s
        ''' % (price, name, comment, img, url))

    next_page = driver.find_element_by_partial_link_text('下一頁')
    next_page.click()
    time.sleep(3)  # wait for the next page to load

    get_goods(driver)


driver = webdriver.Chrome()  # driver is the browser
# never forget the implicit wait
driver.implicitly_wait(5)
try:
    driver.get('https://www.jd.com/')
    # grab the search box and type into it
    input_tag = driver.find_element_by_id('key')
    # search = input('Enter the product to search for: ')
    input_tag.send_keys('iphone')
    # press Enter
    input_tag.send_keys(Keys.ENTER)
    get_goods(driver)

except Exception as e:
    print(e)

finally:
    driver.close()
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # lookup strategies: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # wait for specific elements to load
import time

driver = webdriver.Chrome()
driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
driver.implicitly_wait(3)  # implicit wait

try:
    # first switch to the frame whose id is iframeResult
    driver.switch_to.frame('iframeResult')
    sourse = driver.find_element_by_id('draggable')
    target = driver.find_element_by_id('droppable')

    # Approach 2: a custom action chain, moving a small offset at a time.
    # click_and_hold presses the mouse button on sourse and keeps it held down
    ActionChains(driver).click_and_hold(sourse).perform()
    # distance to drag
    distance = target.location['x'] - sourse.location['x']

    track = 0
    while track < distance:
        # move 2 pixels along the x axis per step
        ActionChains(driver).move_by_offset(xoffset=2, yoffset=0).perform()
        track += 2

    # release the mouse button
    ActionChains(driver).release().perform()

    time.sleep(6)

finally:
    driver.close()
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument('--headless')  # no visible window; on Linux without a display this flag is required or startup fails
driver = webdriver.Chrome(chrome_options=chrome_options)
"""
selenium can hand its cookies over as a dict, which can then be passed to
requests when requesting other URLs.
"""
url = 'https://www.baidu.com/s?ie=UTF-8&wd=python'
driver.get(url)
# for i in driver.get_cookies():
#     print(i)
co = {cookie['name']: cookie['value'] for cookie in driver.get_cookies()}

resp = requests.get(url, headers={
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
})
print(resp.cookies.get_dict())
print(co)

driver.quit()
1. find_element vs find_elements: one element vs a list of elements. find_element raises when nothing is found, while find_elements returns an empty list — e.g. for grabbing a "下一頁" (next page) link, find_elements is the better fit.
2. by_link_text vs by_partial_link_text: exact link text vs a substring of it.
3. by_css_selector usage: #food span.dairy.aged
4. With by_xpath, read attributes and text via get_attribute() and .text.
5. If the page contains an iframe or frame, call driver.switch_to.frame first, otherwise elements inside it cannot be located.
6. find_element_by_class_name accepts a single class name only, never several  # Compound class names not permitted
   (points 1 and 6 are sketched in the example below).
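A short sketch of points 1 and 6, assuming a Chrome driver is installed; the class names in the last lookup are hypothetical:

from selenium import webdriver

driver = webdriver.Chrome()
driver.implicitly_wait(3)
driver.get('https://www.jd.com/')

# 1. find_elements never raises: an empty list simply means "no such element",
#    which is convenient for optional elements like a "next page" link.
next_links = driver.find_elements_by_partial_link_text('下一頁')
if next_links:
    next_links[0].click()

# 6. find_element_by_class_name rejects compound class names such as 'element xxx';
#    use a CSS selector when an element must carry several classes at once.
items = driver.find_elements_by_css_selector('.gl-item.gl-item-extra')  # hypothetical class names

driver.quit()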
XPath is a language for finding information in an XML document; it can be used to traverse elements and attributes of an XML document.
XPath is a major element of the W3C XSLT standard, and both XQuery and XPointer are built on top of XPath expressions.
An understanding of XPath is therefore fundamental to much advanced XML work.
Documentation: http://www.w3school.com.cn/xpath/index.asp
XPath essentials (enough to cover about 80% of day-to-day needs)
- Getting text
  1. `a/text()` gets the text of a
  2. `a//text()` gets the text of a and of every tag under it
  3. `//a[text()='下一頁']` selects the a tags whose text is 下一頁 ("next page")
- The `@` sign
  1. `a/@href`
  2. `//ul[@id="detail-list"]`
- `//`
  1. at the start of an xpath it means "start selecting from anywhere in the html"
  2. `li//a` means any a tag anywhere under the li

Note: xpath helper and Chrome's "copy xpath" both extract from the Elements panel, but what a crawler gets back is the raw response for the URL, which often differs from Elements.
Node selection syntax
Wildcard | Description |
---|---|
* | Matches any element node. |
@* | Matches any attribute node. |
node() | Matches any node of any kind. |
Path expression | Result |
---|---|
/bookstore/* | Selects all child elements of the bookstore element. |
//* | Selects all elements in the document. |
//title[@*] | Selects all title elements that have at least one attribute. |
補充:較經常使用xpath寫法 1.包含: `//div[contains(@class,'i')]` 類名包含 i 的div
The lxml XML toolkit is a Pythonic binding for the C libraries libxml2 and libxslt.
It is unique in that it combines the speed and XML feature completeness of these libraries with
the simplicity of a native Python API, mostly compatible but superior to the well-known ElementTree API.
The latest release works with all CPython versions from 2.7 to 3.7. See the introduction for more information about background
and goals of the lxml project. Some common questions are answered in the FAQ.
Official documentation: https://lxml.de/
# coding=utf-8
from lxml import etree

text = ''' <div> <ul>
<li class="item-1"><a>first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul> </div> '''

html = etree.HTML(text)
print(html, type(html))  # <Element html at 0x1e7e52faf08> <class 'lxml.etree._Element'>
# inspect the string held by the element object
print(etree.tostring(html).decode())

# get the href of the a under every li whose class is item-1
ret1 = html.xpath("//li[@class='item-1']/a/@href")
print(ret1, type(ret1[0]))  # ['link2.html', 'link4.html'] <class 'lxml.etree._ElementUnicodeResult'>

# get the text of the a under every li whose class is item-1, requiring the a to have an href
ret2 = html.xpath("//li[@class='item-1']/a[@href]/text()")
print(ret2)  # ['second item', 'fourth item']

# each li is one news item; build a dict of text -> url
news_dict = dict(zip(ret2, ret1))
print(news_dict)  # {'second item': 'link2.html', 'fourth item': 'link4.html'}

for href in ret1:
    item = {}
    item["href"] = href
    item["title"] = ret2[ret1.index(href)]
    print(item)

# grouping: first select the li tags, then keep writing xpath against each group - the usual pattern
ret3 = html.xpath("//li[@class='item-1']")
print(ret3, type(ret3[1]))
for i in ret3:
    item = {}
    item["title"] = i.xpath("a/text()")[0] if len(i.xpath("./a/text()")) > 0 else None
    item["href"] = i.xpath("./a/@href")[0] if len(i.xpath("./a/@href")) > 0 else None
    print(item)
1. lxml can repair broken HTML, but it may "fix" it incorrectly
   - use etree.tostring to inspect the repaired HTML, and write your xpath against that repaired string
2. lxml accepts both bytes and str
3. A good strategy for extracting page data:
   1. group first: get a list of the elements that delimit each record
   2. then iterate, extracting the fields from each group, so fields cannot get matched to the wrong record
   - the xpath method returns a list, so remember to take [0], e.g.:
     item["href"] = i.xpath("./a/@href")[0] if len(i.xpath("./a/@href")) > 0 else None
     item["href"] = i.xpath("//a/@href")[0]  # do not write it like this - it searches the whole pre-grouping etree._Element and pulls in every group
from lxml import etree

# a malformed comment in the text wipes out everything after it, so the rest of the document is lost
text = ''' <div> <ul>
<li class="item-1"><a>first item</a></li>
<!-- <li class="item-1"><a href="link2.html">second item</a></li> --!>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul> </div> '''

html = etree.HTML(text)
# inspect the string held by the element object
print(etree.tostring(html).decode())
"""
<html><body><div> <ul>
<li class="item-1"><a>first item</a></li>
</ul></div></body></html>
"""
Introduction: retrying is a Python retry package that automatically re-runs code that may fail. It provides a decorator, retry; a decorated function is re-executed whenever it fails, and by default it keeps retrying for as long as the function keeps raising.
1. stop_max_attempt_number: the maximum number of attempts; retrying stops once it is exceeded
2. stop_max_delay: the maximum time, in milliseconds, from the first call of the decorated function until it either succeeds or gives up with an error
3. wait_fixed: the pause between two retries
4. retry_on_exception: retry only on the given exceptions, e.g. retry_on_exception(retry_if_io_error)
5. retry_on_result: retry only when the return value matches, e.g. retry_on_result(retry_if_result_none)
1. A plain decorator API
2. Configurable stop conditions (e.g. a cap on the number of attempts)
3. Configurable wait conditions (e.g. exponentially growing waits between attempts)
4. Retrying on specified exceptions
5. Retrying on specified return values
6. In its simplest form, it re-calls a function or method on any exception until a value is returned
(the simple example below uses only a stop condition; a fuller sketch combining several options follows it)
import random
from retrying import retry

@retry(stop_max_attempt_number=3)  # try at most 3 times; if the third attempt still raises, stop retrying and propagate the exception
def do_something_unreliable():
    if random.randint(0, 10) > 1:
        print("just have a test")
        raise IOError("Broken sauce, everything is hosed!!!111one")
    else:
        return "Awesome sauce!"

print(do_something_unreliable())
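A slightly fuller sketch combining several of the options listed above — the wait/stop values are arbitrary, and fetch_page and its URL are placeholders:

import requests
from retrying import retry

def retry_if_result_none(result):
    # used with retry_on_result: returning True means "retry"
    return result is None

@retry(stop_max_attempt_number=5,    # give up after 5 attempts...
       stop_max_delay=10000,         # ...or after 10 seconds in total
       wait_fixed=2000,              # wait 2 seconds between attempts
       retry_on_result=retry_if_result_none)
def fetch_page(url):
    resp = requests.get(url, timeout=3)
    if resp.status_code != 200:
        return None                  # triggers a retry via retry_on_result
    return resp.text

html = fetch_page('https://httpbin.org/get')  # placeholder URL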
Tesseract is an open-source OCR engine that works out of the box.
Installation: https://blog.csdn.net/u010454030/article/details/80515501
It is not used all that much in scraping; for captcha recognition, people generally rely on a paid captcha-solving platform for better accuracy, e.g. Yundama: http://www.yundama.com/apidoc/ (a minimal pytesseract sketch follows).
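Still, for simple and clean captchas a minimal sketch with the pytesseract wrapper looks like this — 'captcha.png' is a placeholder file, and both pytesseract and the Tesseract binary are assumed to be installed:

from PIL import Image
import pytesseract

# 'captcha.png' is a placeholder; point this at a real image file
img = Image.open('captcha.png')

# convert to grayscale and binarize, which helps Tesseract on noisy captchas
img = img.convert('L').point(lambda p: 255 if p > 140 else 0)

text = pytesseract.image_to_string(img)
print(text.strip())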
References:
1.http://docs.python-requests.org/zh_CN/latest/index.html
2.https://www.cnblogs.com/wupeiqi/articles/6283017.html
3.http://docs.python-requests.org/en/master/
4.https://www.crummy.com/software/BeautifulSoup/bs4/doc/
5.https://www.cnblogs.com/liuqingzheng/articles/10261331.html
6.https://www.cnblogs.com/liuqingzheng/articles/9146902.html