When getting started with web scraping, you will use the requests module constantly. To use it well, you need to be familiar with HTTP, HTTPS, and how a browser issues requests. Taking the time early on to understand the browser request cycle and what a crawler fundamentally does makes everything that follows much easier.
```python
# GET request
response = requests.get(url, headers=headers)

# GET request with query-string parameters
response = requests.get(url, params=kw, headers=headers)

# POST request
response = requests.post(url, data=data, headers=headers)
```
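As a concrete illustration of these three call patterns, here is a minimal runnable sketch; the httpbin.org URL, the User-Agent string, and the parameter names are placeholders, not part of the original:

```python
import requests

headers = {"User-Agent": "Mozilla/5.0"}  # placeholder UA string
kw = {"q": "python"}                     # example query parameters

# GET with query parameters; httpbin echoes the request back
response = requests.get("https://httpbin.org/get", params=kw, headers=headers)
print(response.status_code)  # 200 on success
print(response.url)          # https://httpbin.org/get?q=python
```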
```python
# create a session instance
session = requests.session()

# GET request through the session
response = session.get(url, headers=headers)

# POST request through the same session
response = session.post(post_url, data=post_data, headers=headers)
```
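What makes a session useful is that cookies set by one response are stored and sent automatically on subsequent requests. A minimal sketch of that behavior, assuming the httpbin.org test endpoints:

```python
import requests

session = requests.session()

# this endpoint answers with a Set-Cookie header, which the session stores
session.get("https://httpbin.org/cookies/set/token/abc123")

# the stored cookie is attached automatically to the next request
response = session.get("https://httpbin.org/cookies")
print(response.json())  # {'cookies': {'token': 'abc123'}}
```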
cookies = {"cookie的name":"cookie的value"}
requests.get(url,headers=headers,cookies=cookie_dict}
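A runnable version of the same idea; httpbin simply echoes back the cookies it receives, and the cookie name and value here are made up:

```python
import requests

cookies = {"session_id": "abc123"}  # made-up cookie for illustration
response = requests.get("https://httpbin.org/cookies", cookies=cookies)
print(response.json())  # {'cookies': {'session_id': 'abc123'}}
```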
A quick refresher on dict comprehensions, which the next snippet relies on:

```python
>>> {i: i + 10 for i in range(10)}
{0: 10, 1: 11, 2: 12, 3: 13, 4: 14, 5: 15, 6: 16, 7: 17, 8: 18, 9: 19}
>>> {i: i + 10 for i in range(10) if i % 2 == 0}
{0: 10, 2: 12, 4: 14, 6: 16, 8: 18}
```
```python
# build a cookie dict from the raw Cookie header string:
# each pair looks like "name=value" and pairs are separated by "; "
cookie_dict = {i.split("=")[0]: i.split("=")[1] for i in cookie.split("; ")}
response = requests.get(url, headers=headers, cookies=cookie_dict)
```
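For example, with a Cookie header value copied from the browser's developer tools (the string below is made up), the comprehension yields:

```python
cookie = "BDORZ=27315; PSTM=1622000000"  # made-up Cookie header value

cookie_dict = {i.split("=")[0]: i.split("=")[1] for i in cookie.split("; ")}
print(cookie_dict)  # {'BDORZ': '27315', 'PSTM': '1622000000'}
```

Note that `i.split("=", 1)` is the safer choice when cookie values can themselves contain an `=` sign.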
```python
# coding=utf-8
import requests

url = "http://www.baidu.com"
response = requests.get(url)
print(type(response.cookies))
# output:
# <class 'requests.cookies.RequestsCookieJar'>

# convert the CookieJar object into a plain dict
cookies = requests.utils.dict_from_cookiejar(response.cookies)
print(cookies)
# output:
# {'BDORZ': '27315'}
```
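requests also provides the reverse conversion, requests.utils.cookiejar_from_dict, which is handy for seeding a session with cookies you already have; a brief sketch:

```python
import requests

# turn a plain dict back into a RequestsCookieJar
jar = requests.utils.cookiejar_from_dict({"BDORZ": "27315"})

session = requests.session()
session.cookies = jar  # subsequent session requests will carry this cookie
```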
Method 1: timeout
```python
# give up if no response is received within 3 seconds
response = requests.get(url, timeout=3)
```
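On its own, an expired timeout raises requests.exceptions.Timeout, so in practice the call is wrapped in try/except; a minimal sketch, using an httpbin endpoint that deliberately responds slowly (placeholder URL):

```python
import requests

try:
    # httpbin's /delay/10 endpoint waits 10 seconds before responding
    response = requests.get("https://httpbin.org/delay/10", timeout=3)
except requests.exceptions.Timeout:
    print("request timed out after 3 seconds")
```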
Method 2: combine timeout with the retrying module

```python
import requests
from retrying import retry

headers = {"User-Agent": "Mozilla/5.0"}  # placeholder, not in the original

# maximum number of connection attempts before giving up
@retry(stop_max_attempt_number=3)
def _parse_url(url):
    """The leading underscore marks this function as internal to the module."""
    response = requests.get(url, headers=headers, timeout=3)
    assert response.status_code == 200  # treat non-200 responses as failures
    return response
```
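A hedged usage sketch: with retrying's defaults, once all three attempts have failed the last exception propagates to the caller, so some error handling is still needed (the URL is a placeholder):

```python
try:
    response = _parse_url("https://httpbin.org/get")
    print(response.status_code)
except Exception as e:  # after 3 failed attempts the final error is re-raised
    print("all retries failed:", e)
```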